LinuxLists.cc - [PATCH] 2.6.0-test4 -- add context switch counters

2003-08-27 00:58:16

Subject: [PATCH] 2.6.0-test4 -- add context switch counters

Currently, the context switch counters reported by getrusage() are
always zero. The appended patch adds fields to struct task_struct to
count context switches, and adds code to do the counting.

The patch adds 4 longs to struct task_struct, and a single addition to
the fast path in schedule().

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.1291 -> 1.1292
# include/linux/sched.h 1.162 -> 1.163
# kernel/fork.c 1.137 -> 1.138
# kernel/sys.c 1.54 -> 1.55
# kernel/sched.c 1.207 -> 1.208
# kernel/exit.c 1.111 -> 1.112
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 03/08/27 [email protected] 1.1292
# Add context switch counters to struct task_struct; add code to
# update them in schedule(), initialise them in copy_mm(), and copy
# them to user space in getrusage().
# --------------------------------------------
#
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h Wed Aug 27 10:57:24 2003
+++ b/include/linux/sched.h Wed Aug 27 10:57:24 2003
@@ -391,6 +391,7 @@
struct timer_list real_timer;
struct list_head posix_timers; /* POSIX.1b Interval Timers */
unsigned long utime, stime, cutime, cstime;
+ unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
u64 start_time;
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
diff -Nru a/kernel/exit.c b/kernel/exit.c
--- a/kernel/exit.c Wed Aug 27 10:57:24 2003
+++ b/kernel/exit.c Wed Aug 27 10:57:24 2003
@@ -80,6 +80,8 @@
p->parent->cmin_flt += p->min_flt + p->cmin_flt;
p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt;
p->parent->cnswap += p->nswap + p->cnswap;
+ p->parent->cnvcsw += p->nvcsw + p->cnvcsw;
+ p->parent->cnivcsw += p->nivcsw + p->cnivcsw;
sched_exit(p);
write_unlock_irq(&tasklist_lock);
spin_unlock(&p->proc_lock);
diff -Nru a/kernel/fork.c b/kernel/fork.c
--- a/kernel/fork.c Wed Aug 27 10:57:24 2003
+++ b/kernel/fork.c Wed Aug 27 10:57:24 2003
@@ -461,6 +461,7 @@
tsk->min_flt = tsk->maj_flt = 0;
tsk->cmin_flt = tsk->cmaj_flt = 0;
tsk->nswap = tsk->cnswap = 0;
+ tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0;

tsk->mm = NULL;
tsk->active_mm = NULL;
diff -Nru a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c Wed Aug 27 10:57:24 2003
+++ b/kernel/sched.c Wed Aug 27 10:57:24 2003
@@ -1325,8 +1325,9 @@
}
default:
deactivate_task(prev, rq);
+ prev->nvcsw++;
case TASK_RUNNING:
- ;
+ prev->nivcsw++;
}
pick_next_task:
if (unlikely(!rq->nr_running)) {
diff -Nru a/kernel/sys.c b/kernel/sys.c
--- a/kernel/sys.c Wed Aug 27 10:57:24 2003
+++ b/kernel/sys.c Wed Aug 27 10:57:24 2003
@@ -1309,6 +1309,8 @@
case RUSAGE_SELF:
jiffies_to_timeval(p->utime, &r.ru_utime);
jiffies_to_timeval(p->stime, &r.ru_stime);
+ r.ru_nvcsw = p->nvcsw;
+ r.ru_nivcsw = p->nivcsw;
r.ru_minflt = p->min_flt;
r.ru_majflt = p->maj_flt;
r.ru_nswap = p->nswap;
@@ -1316,6 +1318,8 @@
case RUSAGE_CHILDREN:
jiffies_to_timeval(p->cutime, &r.ru_utime);
jiffies_to_timeval(p->cstime, &r.ru_stime);
+ r.ru_nvcsw = p->cnvcsw;
+ r.ru_nivcsw = p->cnivcsw;
r.ru_minflt = p->cmin_flt;
r.ru_majflt = p->cmaj_flt;
r.ru_nswap = p->cnswap;
@@ -1323,6 +1327,8 @@
default:
jiffies_to_timeval(p->utime + p->cutime, &r.ru_utime);
jiffies_to_timeval(p->stime + p->cstime, &r.ru_stime);
+ r.ru_nvcsw = p->nvcsw + p->cnvcsw;
+ r.ru_nivcsw = p->nivcsw + p->cnivcsw;
r.ru_minflt = p->min_flt + p->cmin_flt;
r.ru_majflt = p->maj_flt + p->cmaj_flt;
r.ru_nswap = p->nswap + p->cnswap;

2003-08-27 01:15:13

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

Peter Chubb <[email protected]> wrote:
>
> Currently, the context switch counters reported by getrusage() are
> always zero. The appended patch adds fields to struct task_struct to
> count context switches, and adds code to do the counting.
>
> The patch adds 4 longs to struct task struct, and a single addition to
> the fast path in schedule().

OK... Why is this useful? A bit of googling doesn't show much interest in
it.

What apps should be reporting this info? /usr/bin/time?

2003-08-27 01:29:20

by Mike Fedyk

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Tue, Aug 26, 2003 at 06:18:07PM -0700, Andrew Morton wrote:
> Peter Chubb <[email protected]> wrote:
> >
> > Currently, the context switch counters reported by getrusage() are
> > always zero. The appended patch adds fields to struct task_struct to
> > count context switches, and adds code to do the counting.
> >
> > The patch adds 4 longs to struct task struct, and a single addition to
> > the fast path in schedule().
>
> OK... Why is this useful? A bit of googling doesn't show much interest in
> it.
>
> What apps should be reporting this info? /usr/bin/time?

E: Could not open lock file /var/lib/apt/lists/lock - open (13 Permission denied)
E: Unable to lock the list directory
Command exited with non-zero status 100
Command being timed: "apt-get update"
User time (seconds): 0.01
System time (seconds): 0.00
Percent of CPU this job got: 6%
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.32

Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0

The averages might be nice...

Maximum resident set size (kbytes): 0

But the maximum would allow any polling app to do its polling less often.
As well as the averages above...

Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 320
Minor (reclaiming a frame) page faults: 21
Voluntary context switches: 0

How can you have voluntary context switches in a preemptive environment?

Involuntary context switches: 0

Swaps: 0

Counting swaps would be nice too.

File system inputs: 0
File system outputs: 0
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0

Yes, yes, yes.

Page size (bytes): 4096
Exit status: 100

One more thing:
$ cat /proc/meminfo
MemTotal: 320628 kB
MemFree: 5148 kB
Buffers: 8316 kB

Where'd shared go, and why didn't rmap start populating this value? It
should be there in the pte-chain lists...

Cached: 127140 kB
SwapCached: 0 kB
Active: 266212 kB
Inactive: 10608 kB
HighTotal: 0 kB
HighFree: 0 kB

Why is high(total|free) there in a non-highmem kernel? If this file were
more dynamic, then we wouldn't have apps that counted on the line number
instead of the first column's value...

Ok, so that was two more... ;)

2003-08-27 01:50:42

by Peter Chubb

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

>>>>> "Andrew" == Andrew Morton <[email protected]> writes:

Andrew> Peter Chubb <[email protected]> wrote:
>> Currently, the context switch counters reported by getrusage() are
>> always zero. The appended patch adds fields to struct task_struct
>> to count context switches, and adds code to do the counting.
>>
>> The patch adds 4 longs to struct task struct, and a single addition
>> to the fast path in schedule().

Andrew> OK... Why is this useful? A bit of googling doesn't show
Andrew> much interest in it.

/usr/bin/time reports the info, yes.

It's useful for tuning the scheduler, and when developing and tuning
posix thread apps.

I wanted to know if the work I did on adding preemption support to IA64
actually made much difference in the number of involuntary context
switches. It doesn't, at least on the measurements I've made so far.

I'm actually interested in getting most of the rusage fields filled in
properly, at least the ones that make sense for Linux.

Things to do are:
-- Track maxrss and report it.
-- Track and integrate rss.
-- Fix the page fault accounting (currently some minor faults
are counted as major faults)
-- add signal accounting

Block I/O isn't that important -- it almost all goes through the page
cache anyway, and it's a bit difficult to assign a particular I/O to a
particular process. Likewise, message I/O isn't that important AFAIK.

The stack, data and unshared data sizes aren't currently
accounted for separately at all, so it'd be a bit difficult to track
the integral of those numbers.

--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
You are lost in a maze of BitKeeper repositories, all slightly different.

2003-08-27 01:52:59

by Peter Chubb

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

>>>>> "Mike" == Mike Fedyk <[email protected]> writes:

Mike> On Tue, Aug 26, 2003 at 06:18:07PM -0700, Andrew Morton wrote:
>> Peter Chubb <[email protected]> wrote:
>> >
>> > Currently, the context switch counters reported by getrusage()
>> > are always zero. The appended patch adds fields to struct
>> > task_struct to count context switches, and adds code to do the
>> > counting.
>> >
>> > The patch adds 4 longs to struct task struct, and a single
>> > addition to the fast path in schedule().
>>
>> OK... Why is this useful? A bit of googling doesn't show much
>> interest in it.
>>
>> What apps should be reporting this info? /usr/bin/time?

Mike> Voluntary context switches: 0

Mike> How can you have voluntary context switches in a preemptive
Mike> environment?

A voluntary context switch is where a task gives up the processor
(e.g., by going to sleep, or by calling sched_yield()).

An involuntary context switch is where a task is preempted by some
other task.

(Another figure-of-merit might be how many times the process is
interrupted by a hardware interrupt)

--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
You are lost in a maze of BitKeeper repositories, all slightly different.

2003-08-27 06:53:29

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 10:57:44AM +1000, Peter Chubb wrote:
> Currently, the context switch counters reported by getrusage() are
> always zero. The appended patch adds fields to struct task_struct to
> count context switches, and adds code to do the counting.
> The patch adds 4 longs to struct task struct, and a single addition to
> the fast path in schedule().

Thanks, this will be useful. We're still missing a fair number of them:

struct rusage {
struct timeval ru_utime; /* user time used */
struct timeval ru_stime; /* system time used */
long ru_maxrss; /* maximum resident set size */
long ru_ixrss; /* integral shared memory size */
long ru_idrss; /* integral unshared data size */
long ru_isrss; /* integral unshared stack size */
long ru_minflt; /* page reclaims */
long ru_majflt; /* page faults */
long ru_nswap; /* swaps */
long ru_inblock; /* block input operations */
long ru_oublock; /* block output operations */
long ru_msgsnd; /* messages sent */
long ru_msgrcv; /* messages received */
long ru_nsignals; /* signals received */
long ru_nvcsw; /* voluntary context switches */
long ru_nivcsw; /* involuntary " */
};

...

case RUSAGE_SELF:
jiffies_to_timeval(p->utime, &r.ru_utime);
jiffies_to_timeval(p->stime, &r.ru_stime);
r.ru_minflt = p->min_flt;
r.ru_majflt = p->maj_flt;
r.ru_nswap = p->nswap;
break;

and we're worse off yet: "FIXME! Get the fault counts properly!" ...
AFAICT literally the only useful number here is utime/stime.

-- wli

P.S.:
The stuff in /proc/$PID/statm isn't a big deal; I've got full 2.4.x
semantics (modulo the VSZ correction) with fully O(1) algorithmic
overhead in some patch originally by bcrl I forward ported somewhere.

2003-08-27 07:15:48

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

Mike, your MUA sucks; you unwittingly removed yourself from Reply-To:

On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
> Command exited with non-zero status 100
> Command being timed: "apt-get update"
> User time (seconds): 0.01
> System time (seconds): 0.00
> Percent of CPU this job got: 6%
> Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.32
> Average shared text size (kbytes): 0
> Average unshared data size (kbytes): 0
> Average stack size (kbytes): 0
> Average total size (kbytes): 0
> The averages might be nice...

The averages themselves aren't reported with getrusage(), only direct
usage measurements. Presumably luserspace computes the averages itself.
i.e. the counters are all for non-average versions of these stats and
(because we're seeing all 0's) are not reported at all.

On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
> Maximum resident set size (kbytes): 0
> But the maximum would allow any polling app to do its polling less often.
> As well as the averages above...
> Average resident set size (kbytes): 0
> Major (requiring I/O) page faults: 320
> Minor (reclaiming a frame) page faults: 21

The fault counters are vaguely bogus when threads are involved. There's
a comment alluding to that nearby.

On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
> Voluntary context switches: 0
> How can you have voluntary context switches in a preemptive environment?
> Involuntary context switches: 0

Irrelevant to CONFIG_PREEMPT; preemptive multitasking (i.e. userspace can
be preempted) as UNIX has always done is the important issue here.

On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
> Swaps: 0
> Counting swaps would be nice too.

This already has two counters in the task_t (no, I will not use Finnish
Hungarian notation in my general posts) that are 100% unused. Probably
the only thing preventing slab poison from showing up there outright is
the whole task_t copy in kernel/fork.c and the bss zeroing for init_task.

On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
> File system inputs: 0
> File system outputs: 0
> Socket messages sent: 0
> Socket messages received: 0
> Signals delivered: 0
> Yes, yes, yes.

These would be easy to set up, they just need counters and the ticking
of the counters dropped in.

On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
> Page size (bytes): 4096
> Exit status: 100
> One more thing:
> $ cat /proc/meminfo
> MemTotal: 320628 kB
> MemFree: 5148 kB
> Buffers: 8316 kB
> Where'd shared go, and why didn't rmap start populating this value? It
> should be there in the pte-chain lists...

Shared isn't particularly useful as a single value unqualified by
sharing level.

On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
> Cached: 127140 kB
> SwapCached: 0 kB
> Active: 266212 kB
> Inactive: 10608 kB
> HighTotal: 0 kB
> HighFree: 0 kB
> Why is high(total|free) there in a non-highmem kernel? If this file were
> more dynamic, then we wouldn't have apps that counted on the line number
> instead of the first colum's value...
> Ok, so that was two more... ;)

They could probably very well be omitted; in all likelihood just making
the format more resistant to .config changes to make luserspace's life
easier is a good reason to keep it there.

-- wli

2003-08-27 07:25:36

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 11:50:30AM +1000, Peter Chubb wrote:
> I'm actually intested in getting most of the rusage fields filled in
> properly, at least the ones that make sense for Linux.

Me too.

On Wed, Aug 27, 2003 at 11:50:30AM +1000, Peter Chubb wrote:
> Things to do are:
> -- Track maxrss and report it.

That's easy.

On Wed, Aug 27, 2003 at 11:50:30AM +1000, Peter Chubb wrote:
> -- Track and integrate rss.
> -- Fix the page fault accounting (currently some minor faults
> are counted as major faults)

Hmm, I don't remember this offhand. I thought the bigger issue was with
threads.

On Wed, Aug 27, 2003 at 11:50:30AM +1000, Peter Chubb wrote:
> -- add signal accounting

Sounds easy.

On Wed, Aug 27, 2003 at 11:50:30AM +1000, Peter Chubb wrote:
> Block I/O isn't that important -- it almost all goes through the page
> cache anyway, and it's a bit difficult to assign a particular I/O to a
> particular process. Likewise, message I.O isn't that important AFAIK.

Well, ignoring the background io issue and just ticking per-task counters
in the read/write syscalls sounds good enough to me.

On Wed, Aug 27, 2003 at 11:50:30AM +1000, Peter Chubb wrote:
> The stack, data and unshared data sizes aren't currently
> accounted for separately at all, so it'd be a bit difficult to track
> the integral of those numbers.

I've got some stuff to keep the derivatives of these going on the back
burner, and integrating isn't hard.

-- wli

2003-08-27 07:39:42

by Peter Chubb

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

>> Command exited with non-zero status 100 Command being timed:
>> "apt-get update" User time (seconds): 0.01 System time (seconds):
>> 0.00 Percent of CPU this job got: 6% Elapsed (wall clock) time
>> (h:mm:ss or m:ss): 0:00.32 Average shared text size (kbytes): 0
>> Average unshared data size (kbytes): 0 Average stack size (kbytes):
>> 0 Average total size (kbytes): 0 The averages might be nice...

William> The averages themselves aren't reported with getrusage(),
William> only direct usage measurements. Presumably luserspace
William> computes the averages itself. i.e. the counters are all for
William> non-average versions of these stats and (because we're seeing
William> all 0's) are not reported at all.

Yes, the kernel is (supposed) to calculate the integral over time of
the memory sizes; user space divides these integrals by elapsed time
to get averages.

To calculate these you need a timestamp for last change, and a set of
counters.
Then code to update all the counters every time one of the sizes
changes (otherwise you need a timestamp for each counter) by adding
current_size*(current_time - last_change_time) to each counter.

William> On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
>> Maximum resident set size (kbytes): 0 But the maximum would allow
>> any polling app to do its polling less often. As well as the
>> averages above... Average resident set size (kbytes): 0 Major
>> (requiring I/O) page faults: 320 Minor (reclaiming a frame) page
>> faults: 21

William> The fault counters are vaguely bogus when threads are
William> involved. There's a comment alluding to that nearby.

The fault counters are incorrect anyway --- faults satisfied from the
page cache are counted as major faults, whereas we expect only faults
that sleep for disk I/O to be counted as major faults.

William> On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
>> Swaps: 0 Counting swaps would be nice too.

William> This already has two counters in the task_t (no, I will not
William> use Finnish Hungarian notation in my general posts) that are
William> 100% unused. Probably the only thing preventing slab poison
William> from showing up there outright is the whole task_t copy in
William> kernel/fork.c and the bss zeroing for init_task.

It's unclear what `swaps' are in Linux. Traditionally, this rusage
field was the number of complete swapouts --- I'm not sure what the
equivalent is when processes are not swapped out holus-bolus, but are
paged gradually.

William> On Tue, Aug 26, 2003 at 06:29:14PM -0700, Mike Fedyk wrote:
>> File system inputs: 0 File system outputs: 0 Socket messages sent:
>> 0 Socket messages received: 0 Signals delivered: 0 Yes, yes, yes.

William> These would be easy to set up, they just need counters and
William> the ticking of the counters dropped in.

It's on my list of things to do, if not soon, then I'm hoping for a
summer student to do some of this stuff.

--
Dr Peter Chubb http://www.gelato.unsw.edu.au peterc AT gelato.unsw.edu.au
You are lost in a maze of BitKeeper repositories, all slightly different.

2003-08-27 07:50:45

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 05:39:27PM +1000, Peter Chubb wrote:
> Yes, the kernel is (supposed) to calculate the integral over time of
> the memory sizes; user space divides these integrals by elapsed time
> to get averages.
> To calculate these you need a timestamp for last change, and a set of
> counters.
> Then code to update all the counters every time one of the sizes
> change (otherwise you need a timestamp for each counter) by adding
> current_size*(current_time - last_change_time) to each counter.

At some point after saying the wrong thing I realized this.

William> The fault counters are vaguely bogus when threads are
William> involved. There's a comment alluding to that nearby.

On Wed, Aug 27, 2003 at 05:39:27PM +1000, Peter Chubb wrote:
> The fault counters are incorrect anyway --- faults satisfied from the
> page cache are counted as major faults, whereas we expect only faults
> that sleep for disk I/O to be counted as major faults.

Okay, we can handle that by pushing the counter ticking down far enough
we can actually tell whether io was done or not. In the meantime we're
reporting garbage.

William> This already has two counters in the task_t (no, I will not
William> use Finnish Hungarian notation in my general posts) that are
William> 100% unused. Probably the only thing preventing slab poison
William> from showing up there outright is the whole task_t copy in
William> kernel/fork.c and the bss zeroing for init_task.

On Wed, Aug 27, 2003 at 05:39:27PM +1000, Peter Chubb wrote:
> It's unclear what `swaps' are in Linux. Traditionally, this rusage
> field was the number of complete swapouts --- I'm not sure what the
> equivalent is when processes are not swapped out holus-bolus, but are
> paged gradually.

We don't have load control yet; the counters should probably be removed
until we do.

-- wli

2003-08-27 08:09:54

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 05:39:27PM +1000, Peter Chubb wrote:
> To calculate these you need a timestamp for last change, and a set of
> counters.
> Then code to update all the counters every time one of the sizes
> change (otherwise you need a timestamp for each counter) by adding
> current_size*(current_time - last_change_time) to each counter.

Hmm. Building a tiny integrated counter ADT sounds useful; I can just
park it on something easy like RSS and we can use it to crush the rest
when there's enough machinery to keep the non-integrated counters.

-- wli

2003-08-27 14:41:05

by bert hubert

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Tue, Aug 26, 2003 at 06:18:07PM -0700, Andrew Morton wrote:

> > always zero. The appended patch adds fields to struct task_struct to
> > count context switches, and adds code to do the counting.

> OK... Why is this useful? A bit of googling doesn't show much interest in
> it.

I'm unaware of the cost of accounting this, but I for one have had
occasions where my system was reporting 20kcs, and I had no clue which
process was causing it. After a while I learned that linuxthreads on SMP can
cause this.

> What apps should be reporting this info? /usr/bin/time?

/proc/$$/stat would also be nice.

Thanks.

--
http://www.PowerDNS.com Open source, database driven DNS Software
http://lartc.org Linux Advanced Routing & Traffic Control HOWTO

2003-08-27 15:53:11

by Larry McVoy

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

I normally hate ifdefs but this might be a good place to use a bunch of
macros and make them conditional on config_stats or something. Updating
counters is going to add to the size of the data cache footprint and it
would be nice, for those people working on embedded low speed processors,
if they could config this out. I personally would leave it in, I like
these stats. I just know that the path to slowness is paved one cache
miss at a time.

On Tue, Aug 26, 2003 at 11:54:35PM -0700, William Lee Irwin III wrote:
> On Wed, Aug 27, 2003 at 10:57:44AM +1000, Peter Chubb wrote:
> > Currently, the context switch counters reported by getrusage() are
> > always zero. The appended patch adds fields to struct task_struct to
> > count context switches, and adds code to do the counting.
> > The patch adds 4 longs to struct task struct, and a single addition to
> > the fast path in schedule().
>
> Thanks, this will be useful. We're still missing a fair number of them:
>
> struct rusage {
> struct timeval ru_utime; /* user time used */
> struct timeval ru_stime; /* system time used */
> long ru_maxrss; /* maximum resident set size */
> long ru_ixrss; /* integral shared memory size */
> long ru_idrss; /* integral unshared data size */
> long ru_isrss; /* integral unshared stack size */
> long ru_minflt; /* page reclaims */
> long ru_majflt; /* page faults */
> long ru_nswap; /* swaps */
> long ru_inblock; /* block input operations */
> long ru_oublock; /* block output operations */
> long ru_msgsnd; /* messages sent */
> long ru_msgrcv; /* messages received */
> long ru_nsignals; /* signals received */
> long ru_nvcsw; /* voluntary context switches */
> long ru_nivcsw; /* involuntary " */
> };
--
---
Larry McVoy lm at bitmover.com http://www.bitmover.com/lm

2003-08-27 16:02:38

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 08:52:46AM -0700, Larry McVoy wrote:
> I normally hate ifdefs but this might be a good place to use a bunch of
> macros and make them conditional on config_stats or something. Updating
> counters is going to add to the size of the data cache footprint and it
> would be nice, for those people working on embedded low speed processors,
> if they could config this out. I personally would leave it in, I like
> this stats. I just know that the path to slowness is paved one cache
> miss at a time.

I've profiled this and know the memory stats don't do any harm; the
rest I'd have to see profiled. AFAICT all the damage is done after
ticking mm->rss in the various pagetable copying/blitting operations,
and once we've taken that hit (in mainline!) the other counters are
noise-level. The integral counters are another story; I've not seen
those in action.

-- wli

2003-08-27 16:11:43

by Larry McVoy

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 09:01:33AM -0700, William Lee Irwin III wrote:
> On Wed, Aug 27, 2003 at 08:52:46AM -0700, Larry McVoy wrote:
> > I normally hate ifdefs but this might be a good place to use a bunch of
> > macros and make them conditional on config_stats or something. Updating
> > counters is going to add to the size of the data cache footprint and it
> > would be nice, for those people working on embedded low speed processors,
> > if they could config this out. I personally would leave it in, I like
> > this stats. I just know that the path to slowness is paved one cache
> > miss at a time.
>
> I've profiled this and know the memory stats don't do any harm; the
> rest I'd have to see profiled. AFAICT all the damage is done after
> ticking mm->rss in the various pagetable copying/blitting operations,
> and once we've taken that hit (in mainline!) the other counters are
> noise-level. The integral counters are another story; I've not seen
> those in action.

This is the classic response that I get whenever I raise this sort of
concern. I got it at Sun, I got it at SGI. Everyone says "my change
made no difference". And they are right from one point of view: you
run some micro benchmark and you can't see any difference.

Of course you can't see any difference, in the microbenchmark everything
is in the cache. But you did increase the amount of cache usage.
Consider a real world case where the application and the kernel now
just exactly fit in the caches for the critical loop. Adding one
extra cache line will hurt that application but would never be seen in
a microbenchmark.

The only way to really measure this is with real work loads and a cache
miss counter. And even that won't always show up because if the work load
you choose happened to only use 1/2 of the data cache (for instance) you
need to add enough more than 1/2 of the cache lines to show up in the
results.

Think of it this way: we can add N extra cache lines and see no
difference. Then we add the Nth+1 and all of a sudden things get slow.
Is that the fault of the Nth+1 guy? Nope. It's the fault of all N,
the Nth+1 guy just had bad timing, he should have gotten his change
in earlier.

I realize that I'm being extreme here but if I can get this point across
that's a good thing. I'm convinced that it was a lack of understanding
of this point that led to the bloated commercial operating systems.
Linux needs to stay fast. Processors have cycle times of a third of a
nanosecond yet memory is still ~130ns away.
--
---
Larry McVoy lm at bitmover.com http://www.bitmover.com/lm

2003-08-27 17:56:46

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 09:09:39AM -0700, Larry McVoy wrote:
> This is the classic response that I get whenever I raise this sort of
> concern. I got it at Sun, I got it at SGI. Everyone says "my change
> made no difference". And they are right from one point of view: you
> run some micro benchmark and you can't see any difference.
> Of course you can't see any difference, in the microbenchmark everything
> is in the cache. But you did increase the amount of cache usage.
> Consider a real world case where the application and the kernel now
> just exactly fit in the caches for the critical loop. Adding one
> extra cache line will hurt that application but would never be seen in
> a microbenchmark.

I used a macrobenchmark for this measurement with instruction-level
profiling on cache misses, TLB misses, and cpu cycles.

An unusual result of this was that with respect to cpu cycles, the
most costly operation in the entire kernel after bitblitting userspace
memory was rounding the stack pointer to find current_thread_info();
that is, it was #3, behind only copy_to_user_ll()/copy_from_user_ll().

On Wed, Aug 27, 2003 at 09:09:39AM -0700, Larry McVoy wrote:
> The only way to really measure this is with real work loads and a cache
> miss counter. And even that won't always show up because if the work load
> you choose happened to only use 1/2 of the data cache (for instance) you
> need to add enough more than 1/2 of the cache lines to show up in the
> results.

I already used the cache miss counter. Seeing mm->rss take numerous
cache misses in the loop of copy_page_range() (in mainline!) seemed
unusual. A vaguely plausible explanation (guesswork is required without
an ITP/ICE or a sufficiently useful simulator) is that the pagetable
bitblitting evicted it from the cache, despite my _very_ intense
efforts to reduce the amount of pagetable bitblitting via cacheing.
An alternative explanation is that the off-node access to slab memory
took such large remote access penalties when it did have cache misses
that even a low miss rate elevated it to the top of the profile.

On Wed, Aug 27, 2003 at 09:09:39AM -0700, Larry McVoy wrote:
> Think of it this way: we can add N extra cache lines and see no
> difference. Then we add the Nth+1 and all of a sudden things get slow.
> Is that the fault of the Nth+1 guy? Nope. It's the fault of all N,
> the Nth+1 guy just had bad timing, he should have gotten his change
> in earlier.
> I realize that I'm being extreme here but if I can get this point across
> that's a good thing. I'm convinced that it was a lack of understanding
> of this point that lead to the bloated commercial operating systems.
> Linux needs to stay fast. Processors have cycle times of a third of a
> nanosecond yet memory is still ~130ns away.

This is not lost on me (and I'm in fact pushing other cache preservation
code very hard; c.f. pagetable cacheing discussions and the soon to be
sent bottom-level pagetable cacheing code in -wli). The fact of the
matter is that we lose a cacheline at a time, and if we've already lost
one to mm->rss, we should utilize the rest of it for whatever other
counters are prudent instead of wasting the rest of it.

A number of the rest of these counters are very infrequently updated;
IMHO such things as nswaps (whenever we get load control, which we seem
to be getting various complaints about lacking) and signal counts are
updated rarely enough to ignore the effects of.

-- wli

2003-08-28 16:55:07

by Mike Fedyk

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 12:51:43AM -0700, William Lee Irwin III wrote:
> On Wed, Aug 27, 2003 at 05:39:27PM +1000, Peter Chubb wrote:
> > It's unclear what `swaps' are in Linux. Traditionally, this rusage
> > field was the number of complete swapouts --- I'm not sure what the
> > equivalent is when processes are not swapped out holus-bolus, but are
> > paged gradually.
>
> We don't have load control yet; the counters should probably be removed
> until we do.

Why not just count the number of pages swapped in/out per process? I'm sure
that would be helpful for VM tools polling for stats from userspace... And
even in the development of load control.

2003-08-28 17:08:36

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Thu, Aug 28, 2003 at 09:55:04AM -0700, Mike Fedyk wrote:
[...]

Fix your MUA to put you on the Reply-To: line as you should be. This is
beyond violating the principle of least surprise and into the realm of
a serious communication barrier.

-- wli

2003-08-28 17:06:38

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 12:51:43AM -0700, William Lee Irwin III wrote:
>> We don't have load control yet; the counters should probably be removed
>> until we do.

On Thu, Aug 28, 2003 at 09:55:04AM -0700, Mike Fedyk wrote:
> Why not just count the number of pages swapped in/out per process? I'm sure
> that would be helpful for VM tools polling for stats from userspace... And
> even in the development of load control.

That's good to report, sure; however, that would violate the semantics
of getrusage(), whose nswap refers to whole-process swapping done for
the purposes of reducing the multiprogramming level (i.e. load control).

-- wli

2003-08-28 17:48:02

by Mike Fedyk

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Thu, Aug 28, 2003 at 10:09:47AM -0700, William Lee Irwin III wrote:
> On Thu, Aug 28, 2003 at 09:55:04AM -0700, Mike Fedyk wrote:
> [...]
>
> Fix your MUA to put you on the Reply-To: line as you should be. This is
> beyond violating the principle of least surprise and into the realm of
> a serious communication barrier.

I am on the kernel list, and whether or not I am in the Reply-To, it goes in
the same folder.

I will see the message either way.

Thanks

2003-11-18 01:19:25

by William Lee Irwin III

[permalink] [raw]

Subject: Re: [PATCH] 2.6.0-test4 -- add context switch counters

On Wed, Aug 27, 2003 at 11:50:30AM +1000, Peter Chubb wrote:
>> -- Track and integrate rss.
>> -- Fix the page fault accounting (currently some minor faults
>> are counted as major faults)

On Wed, Aug 27, 2003 at 12:26:33AM -0700, William Lee Irwin III wrote:
> Hmm, I don't remember this offhand. I thought the bigger issue was with
> threads.

How does this look? I probably did the bull in the china shop thing
again, but I can just take feedback and clean things up as-needed.

-- wli

===== Documentation/filesystems/Locking 1.45 vs edited =====
--- 1.45/Documentation/filesystems/Locking Wed Aug 20 22:31:59 2003
+++ edited/Documentation/filesystems/Locking Mon Oct 13 15:13:40 2003
@@ -420,7 +420,7 @@
prototypes:
void (*open)(struct vm_area_struct*);
void (*close)(struct vm_area_struct*);
- struct page *(*nopage)(struct vm_area_struct*, unsigned long, int);
+ struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *);

locking rules:
BKL mmap_sem
===== arch/i386/mm/hugetlbpage.c 1.38 vs edited =====
--- 1.38/arch/i386/mm/hugetlbpage.c Tue Sep 23 23:15:29 2003
+++ edited/arch/i386/mm/hugetlbpage.c Mon Oct 13 15:14:10 2003
@@ -529,7 +529,7 @@
* this far.
*/
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
- unsigned long address, int unused)
+ unsigned long address, int *unused)
{
BUG();
return NULL;
===== arch/ia64/ia32/binfmt_elf32.c 1.15 vs edited =====
--- 1.15/arch/ia64/ia32/binfmt_elf32.c Mon Jul 21 07:39:59 2003
+++ edited/arch/ia64/ia32/binfmt_elf32.c Mon Oct 13 15:15:42 2003
@@ -60,11 +60,13 @@
extern unsigned long *ia32_gdt;

struct page *
-ia32_install_shared_page (struct vm_area_struct *vma, unsigned long address, int no_share)
+ia32_install_shared_page (struct vm_area_struct *vma, unsigned long address, int *type)
{
struct page *pg = ia32_shared_page[(address - vma->vm_start)/PAGE_SIZE];

get_page(pg);
+ if (type)
+ *type = VM_FAULT_MINOR;
return pg;
}

===== arch/ia64/mm/hugetlbpage.c 1.15 vs edited =====
--- 1.15/arch/ia64/mm/hugetlbpage.c Thu Oct 9 16:09:37 2003
+++ edited/arch/ia64/mm/hugetlbpage.c Mon Oct 13 15:15:53 2003
@@ -518,7 +518,7 @@
return 1;
}

-static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int unused)
+static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int *unused)
{
BUG();
return NULL;
===== arch/ppc64/mm/hugetlbpage.c 1.2 vs edited =====
--- 1.2/arch/ppc64/mm/hugetlbpage.c Sat Sep 6 18:40:37 2003
+++ edited/arch/ppc64/mm/hugetlbpage.c Mon Oct 13 16:16:04 2003
@@ -914,7 +914,7 @@
* this far.
*/
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
- unsigned long address, int unused)
+ unsigned long address, int *unused)
{
BUG();
return NULL;
===== arch/sparc64/mm/hugetlbpage.c 1.8 vs edited =====
--- 1.8/arch/sparc64/mm/hugetlbpage.c Tue Aug 26 09:41:27 2003
+++ edited/arch/sparc64/mm/hugetlbpage.c Mon Oct 13 15:16:38 2003
@@ -433,7 +433,7 @@
}

static struct page *
-hugetlb_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
+hugetlb_nopage(struct vm_area_struct *vma, unsigned long address, int *unused)
{
BUG();
return NULL;
===== drivers/char/agp/alpha-agp.c 1.9 vs edited =====
--- 1.9/drivers/char/agp/alpha-agp.c Mon Sep 15 17:03:51 2003
+++ edited/drivers/char/agp/alpha-agp.c Mon Oct 13 15:17:14 2003
@@ -13,7 +13,7 @@

static struct page *alpha_core_agp_vm_nopage(struct vm_area_struct *vma,
unsigned long address,
- int write_access)
+ int *type)
{
alpha_agp_info *agp = agp_bridge->dev_private_data;
dma_addr_t dma_addr;
@@ -30,6 +30,8 @@
*/
page = virt_to_page(__va(pa));
get_page(page);
+ if (type)
+ *type = VM_FAULT_MINOR;
return page;
}

===== drivers/char/drm/drmP.h 1.29 vs edited =====
--- 1.29/drivers/char/drm/drmP.h Thu Sep 25 08:56:58 2003
+++ edited/drivers/char/drm/drmP.h Mon Oct 13 15:17:38 2003
@@ -760,16 +760,16 @@
/* Mapping support (drm_vm.h) */
extern struct page *DRM(vm_nopage)(struct vm_area_struct *vma,
unsigned long address,
- int write_access);
+ int *type);
extern struct page *DRM(vm_shm_nopage)(struct vm_area_struct *vma,
unsigned long address,
- int write_access);
+ int *type);
extern struct page *DRM(vm_dma_nopage)(struct vm_area_struct *vma,
unsigned long address,
- int write_access);
+ int *type);
extern struct page *DRM(vm_sg_nopage)(struct vm_area_struct *vma,
unsigned long address,
- int write_access);
+ int *type);
extern void DRM(vm_open)(struct vm_area_struct *vma);
extern void DRM(vm_close)(struct vm_area_struct *vma);
extern void DRM(vm_shm_close)(struct vm_area_struct *vma);
===== drivers/char/drm/drm_vm.h 1.25 vs edited =====
--- 1.25/drivers/char/drm/drm_vm.h Thu Jul 10 23:18:01 2003
+++ edited/drivers/char/drm/drm_vm.h Mon Oct 13 16:48:08 2003
@@ -76,7 +76,7 @@
*/
struct page *DRM(vm_nopage)(struct vm_area_struct *vma,
unsigned long address,
- int write_access)
+ int *type)
{
#if __REALLY_HAVE_AGP
drm_file_t *priv = vma->vm_file->private_data;
@@ -133,6 +133,8 @@
baddr, __va(agpmem->memory->memory[offset]), offset,
atomic_read(&page->count));

+ if (type)
+ *type = VM_FAULT_MINOR;
return page;
}
vm_nopage_error:
@@ -154,7 +156,7 @@
*/
struct page *DRM(vm_shm_nopage)(struct vm_area_struct *vma,
unsigned long address,
- int write_access)
+ int *type)
{
drm_map_t *map = (drm_map_t *)vma->vm_private_data;
unsigned long offset;
@@ -170,6 +172,8 @@
if (!page)
return NOPAGE_OOM;
get_page(page);
+ if (type)
+ *type = VM_FAULT_MINOR;

DRM_DEBUG("shm_nopage 0x%lx\n", address);
return page;
@@ -268,7 +272,7 @@
*/
struct page *DRM(vm_dma_nopage)(struct vm_area_struct *vma,
unsigned long address,
- int write_access)
+ int *type)
{
drm_file_t *priv = vma->vm_file->private_data;
drm_device_t *dev = priv->dev;
@@ -287,6 +291,8 @@
(offset & (~PAGE_MASK))));

get_page(page);
+ if (type)
+ *type = VM_FAULT_MINOR;

DRM_DEBUG("dma_nopage 0x%lx (page %lu)\n", address, page_nr);
return page;
@@ -304,7 +310,7 @@
*/
struct page *DRM(vm_sg_nopage)(struct vm_area_struct *vma,
unsigned long address,
- int write_access)
+ int *type)
{
drm_map_t *map = (drm_map_t *)vma->vm_private_data;
drm_file_t *priv = vma->vm_file->private_data;
@@ -325,6 +331,8 @@
page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT);
page = entry->pagelist[page_offset];
get_page(page);
+ if (type)
+ *type = VM_FAULT_MINOR;

return page;
}
===== drivers/ieee1394/dma.c 1.5 vs edited =====
--- 1.5/drivers/ieee1394/dma.c Thu Jul 24 17:00:00 2003
+++ edited/drivers/ieee1394/dma.c Mon Oct 13 15:19:12 2003
@@ -187,7 +187,7 @@
/* nopage() handler for mmap access */

static struct page*
-dma_region_pagefault(struct vm_area_struct *area, unsigned long address, int write_access)
+dma_region_pagefault(struct vm_area_struct *area, unsigned long address, int *type)
{
unsigned long offset;
unsigned long kernel_virt_addr;
@@ -202,6 +202,8 @@
(address > (unsigned long) area->vm_start + (PAGE_SIZE * dma->n_pages)) )
goto out;

+ if (type)
+ *type = VM_FAULT_MINOR;
offset = address - area->vm_start;
kernel_virt_addr = (unsigned long) dma->kvirt + offset;
ret = vmalloc_to_page((void*) kernel_virt_addr);
===== drivers/media/video/video-buf.c 1.11 vs edited =====
--- 1.11/drivers/media/video/video-buf.c Mon Oct 6 08:48:02 2003
+++ edited/drivers/media/video/video-buf.c Mon Oct 13 15:19:52 2003
@@ -1076,7 +1076,7 @@
*/
static struct page*
videobuf_vm_nopage(struct vm_area_struct *vma, unsigned long vaddr,
- int write_access)
+ int *type)
{
struct page *page;

@@ -1088,6 +1088,8 @@
if (!page)
return NOPAGE_OOM;
clear_user_page(page_address(page), vaddr, page);
+ if (type)
+ *type = VM_FAULT_MINOR;
return page;
}

===== drivers/scsi/sg.c 1.69 vs edited =====
--- 1.69/drivers/scsi/sg.c Sat Sep 20 02:35:07 2003
+++ edited/drivers/scsi/sg.c Mon Oct 13 15:21:16 2003
@@ -1115,7 +1115,7 @@
}

static struct page *
-sg_vma_nopage(struct vm_area_struct *vma, unsigned long addr, int unused)
+sg_vma_nopage(struct vm_area_struct *vma, unsigned long addr, int *type)
{
Sg_fd *sfp;
struct page *page = NOPAGE_SIGBUS;
@@ -1155,6 +1155,8 @@
page = virt_to_page(page_ptr);
get_page(page); /* increment page count */
}
+ if (type)
+ *type = VM_FAULT_MINOR;
return page;
}

===== fs/ncpfs/mmap.c 1.7 vs edited =====
--- 1.7/fs/ncpfs/mmap.c Sat Aug 10 20:07:55 2002
+++ edited/fs/ncpfs/mmap.c Mon Oct 13 16:23:56 2003
@@ -26,7 +26,7 @@
* Fill in the supplied page for mmap
*/
static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
- unsigned long address, int write_access)
+ unsigned long address, int *type)
{
struct file *file = area->vm_file;
struct dentry *dentry = file->f_dentry;
@@ -85,6 +85,15 @@
memset(pg_addr + already_read, 0, PAGE_SIZE - already_read);
flush_dcache_page(page);
kunmap(page);
+
+ /*
+ * If I understand ncp_read_kernel() properly, the above always
+ * fetches from the network, here the analogue of disk.
+ * -- wli
+ */
+ if (type)
+ *type = VM_FAULT_MAJOR;
+ inc_page_state(pgmajfault);
return page;
}

===== include/linux/mm.h 1.133 vs edited =====
--- 1.133/include/linux/mm.h Sun Oct 5 01:07:49 2003
+++ edited/include/linux/mm.h Mon Oct 13 16:31:00 2003
@@ -143,7 +143,7 @@
struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);
void (*close)(struct vm_area_struct * area);
- struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused);
+ struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
};

@@ -405,7 +405,7 @@
extern void show_free_areas(void);

struct page *shmem_nopage(struct vm_area_struct * vma,
- unsigned long address, int unused);
+ unsigned long address, int *type);
struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags);
void shmem_lock(struct file * file, int lock);
int shmem_zero_setup(struct vm_area_struct *);
@@ -563,7 +563,7 @@
extern void truncate_inode_pages(struct address_space *, loff_t);

/* generic vm_area_ops exported for stackable file systems */
-extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int);
+struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);

/* mm/page-writeback.c */
int write_one_page(struct page *page, int wait);
===== kernel/sys.c 1.66 vs edited =====
--- 1.66/kernel/sys.c Thu Oct 9 15:13:54 2003
+++ edited/kernel/sys.c Mon Oct 13 16:02:41 2003
@@ -1325,8 +1325,6 @@
* either stopped or zombied. In the zombied case the task won't get
* reaped till shortly after the call to getrusage(), in both cases the
* task being examined is in a frozen state so the counters won't change.
- *
- * FIXME! Get the fault counts properly!
*/
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
{
===== mm/filemap.c 1.210 vs edited =====
--- 1.210/mm/filemap.c Tue Oct 7 19:53:43 2003
+++ edited/mm/filemap.c Mon Oct 13 16:25:46 2003
@@ -984,7 +984,7 @@
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*/
-struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
+struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int *type)
{
int error;
struct file *file = area->vm_file;
@@ -993,7 +993,7 @@
struct inode *inode = mapping->host;
struct page *page;
unsigned long size, pgoff, endoff;
- int did_readaround = 0;
+ int did_readaround = 0, majmin = VM_FAULT_MINOR;

pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
@@ -1042,6 +1042,14 @@
if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS)
goto no_cached_page;

+ /*
+ * To keep the pgmajfault counter straight, we need to
+ * check did_readaround, as this is an inner loop.
+ */
+ if (!did_readaround) {
+ majmin = VM_FAULT_MAJOR;
+ inc_page_state(pgmajfault);
+ }
did_readaround = 1;
do_page_cache_readahead(mapping, file,
pgoff & ~(MMAP_READAROUND-1), MMAP_READAROUND);
@@ -1063,6 +1071,8 @@
* Found the page and have a reference on it.
*/
mark_page_accessed(page);
+ if (type)
+ *type = majmin;
return page;

outside_data_content:
@@ -1098,7 +1108,10 @@
return NULL;

page_not_uptodate:
- inc_page_state(pgmajfault);
+ if (!did_readaround) {
+ majmin = VM_FAULT_MAJOR;
+ inc_page_state(pgmajfault);
+ }
lock_page(page);

/* Did it get unhashed while we waited for it? */
===== mm/memory.c 1.139 vs edited =====
--- 1.139/mm/memory.c Wed Oct 8 08:59:27 2003
+++ edited/mm/memory.c Mon Oct 13 15:44:10 2003
@@ -1416,7 +1416,7 @@
}
smp_rmb(); /* Prevent CPU from reordering lock-free ->nopage() */
retry:
- new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
+ new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);

/* no page was available -- either SIGBUS or OOM */
if (new_page == NOPAGE_SIGBUS)
@@ -1485,14 +1485,12 @@
pte_unmap(page_table);
page_cache_release(new_page);
spin_unlock(&mm->page_table_lock);
- ret = VM_FAULT_MINOR;
goto out;
}

/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
spin_unlock(&mm->page_table_lock);
- ret = VM_FAULT_MAJOR;
goto out;
oom:
ret = VM_FAULT_OOM;
===== mm/shmem.c 1.135 vs edited =====
--- 1.135/mm/shmem.c Tue Sep 9 23:41:41 2003
+++ edited/mm/shmem.c Mon Oct 13 16:30:00 2003
@@ -67,7 +67,7 @@
};

static int shmem_getpage(struct inode *inode, unsigned long idx,
- struct page **pagep, enum sgp_type sgp);
+ struct page **pagep, enum sgp_type sgp, int *type);

static inline struct page *shmem_dir_alloc(unsigned int gfp_mask)
{
@@ -522,7 +522,7 @@
if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
(void) shmem_getpage(inode,
attr->ia_size>>PAGE_CACHE_SHIFT,
- &page, SGP_READ);
+ &page, SGP_READ, NULL);
}
}
}
@@ -734,7 +734,7 @@
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache
*/
-static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp)
+static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp, int *type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -743,7 +743,7 @@
struct page *swappage;
swp_entry_t *entry;
swp_entry_t swap;
- int error;
+ int error, majmin = VM_FAULT_MINOR;

if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
@@ -780,6 +780,10 @@
if (!swappage) {
shmem_swp_unmap(entry);
spin_unlock(&info->lock);
+ /* here we actually do the io */
+ if (majmin == VM_FAULT_MINOR && type)
+ inc_page_state(pgmajfault);
+ majmin = VM_FAULT_MAJOR;
swapin_readahead(swap);
swappage = read_swap_cache_async(swap);
if (!swappage) {
@@ -926,6 +930,8 @@
} else
*pagep = ZERO_PAGE(0);
}
+ if (type)
+ *type = majmin;
return 0;

failed:
@@ -936,7 +942,7 @@
return error;
}

-struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
+struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int *type)
{
struct inode *inode = vma->vm_file->f_dentry->d_inode;
struct page *page = NULL;
@@ -947,7 +953,7 @@
idx += vma->vm_pgoff;
idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;

- error = shmem_getpage(inode, idx, &page, SGP_CACHE);
+ error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
if (error)
return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;

@@ -974,7 +980,7 @@
/*
* Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
*/
- err = shmem_getpage(inode, pgoff, &page, sgp);
+ err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
if (err)
return err;
if (page) {
@@ -1124,7 +1130,7 @@
shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
{
struct inode *inode = page->mapping->host;
- return shmem_getpage(inode, page->index, &page, SGP_WRITE);
+ return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
}

static ssize_t
@@ -1181,7 +1187,7 @@
* But it still may be a good idea to prefault below.
*/

- err = shmem_getpage(inode, index, &page, SGP_WRITE);
+ err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
if (err)
break;

@@ -1264,7 +1270,7 @@
break;
}

- desc->error = shmem_getpage(inode, index, &page, SGP_READ);
+ desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
if (desc->error) {
if (desc->error == -EINVAL)
desc->error = 0;
@@ -1515,7 +1521,7 @@
iput(inode);
return -ENOMEM;
}
- error = shmem_getpage(inode, 0, &page, SGP_WRITE);
+ error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
if (error) {
vm_unacct_memory(VM_ACCT(1));
iput(inode);
@@ -1551,7 +1557,7 @@
static int shmem_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct page *page = NULL;
- int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
+ int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
if (res)
return res;
res = vfs_readlink(dentry, buffer, buflen, kmap(page));
@@ -1564,7 +1570,7 @@
static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct page *page = NULL;
- int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
+ int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
if (res)
return res;
res = vfs_follow_link(nd, kmap(page));
===== sound/core/pcm_native.c 1.41 vs edited =====
--- 1.41/sound/core/pcm_native.c Mon Sep 29 19:28:26 2003
+++ edited/sound/core/pcm_native.c Mon Oct 13 15:44:41 2003
@@ -2779,7 +2779,7 @@
return mask;
}

-static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, unsigned long address, int no_share)
+static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, unsigned long address, int *type)
{
snd_pcm_substream_t *substream = (snd_pcm_substream_t *)area->vm_private_data;
snd_pcm_runtime_t *runtime;
@@ -2791,6 +2791,8 @@
page = virt_to_page(runtime->status);
if (!PageReserved(page))
get_page(page);
+ if (type)
+ *type = VM_FAULT_MINOR;
return page;
}

===== sound/oss/via82cxxx_audio.c 1.34 vs edited =====
--- 1.34/sound/oss/via82cxxx_audio.c Sun Oct 5 01:07:55 2003
+++ edited/sound/oss/via82cxxx_audio.c Mon Oct 13 15:53:34 2003
@@ -2116,7 +2116,7 @@

static struct page * via_mm_nopage (struct vm_area_struct * vma,
- unsigned long address, int write_access)
+ unsigned long address, int *type)
{
struct via_info *card = vma->vm_private_data;
struct via_channel *chan = &card->ch_out;
@@ -2124,12 +2124,11 @@
unsigned long pgoff;
int rd, wr;

- DPRINTK ("ENTER, start %lXh, ofs %lXh, pgoff %ld, addr %lXh, wr %d\n",
+ DPRINTK ("ENTER, start %lXh, ofs %lXh, pgoff %ld, addr %lXh\n",
vma->vm_start,
address - vma->vm_start,
(address - vma->vm_start) >> PAGE_SHIFT,
- address,
- write_access);
+ address);

if (address > vma->vm_end) {
DPRINTK ("EXIT, returning NOPAGE_SIGBUS\n");
@@ -2167,6 +2166,8 @@
DPRINTK ("EXIT, returning page %p for cpuaddr %lXh\n",
dmapage, (unsigned long) chan->pgtbl[pgoff].cpuaddr);
get_page (dmapage);
+ if (type)
+ *type = VM_FAULT_MINOR;
return dmapage;
}

===== sound/oss/emu10k1/audio.c 1.19 vs edited =====
--- 1.19/sound/oss/emu10k1/audio.c Tue Aug 26 09:25:41 2003
+++ edited/sound/oss/emu10k1/audio.c Mon Oct 13 15:52:23 2003
@@ -989,7 +989,7 @@
return 0;
}

-static struct page *emu10k1_mm_nopage (struct vm_area_struct * vma, unsigned long address, int write_access)
+static struct page *emu10k1_mm_nopage (struct vm_area_struct * vma, unsigned long address, int *type)
{
struct emu10k1_wavedevice *wave_dev = vma->vm_private_data;
struct woinst *woinst = wave_dev->woinst;
@@ -1032,6 +1032,8 @@
get_page (dmapage);

DPD(3, "page: %#lx\n", (unsigned long) dmapage);
+ if (type)
+ *type = VM_FAULT_MINOR;
return dmapage;
}