The current swap prefetching implementation is far too aggressive, to the
point where its cpu and disk access are noticeable. This patch addresses
that issue.
Andrew, please apply this one and keep ignoring the yield patch the way
you rightly already were.
Cheers,
Con
---
Swap prefetch tweaks.
Add watermarks to swap prefetching: start prefetching once free memory
rises above pages_high * 4 and continue down to pages_high * 3.
Check cpu load and only prefetch when kprefetchd is the only process
running. Testing the load of just the cpu that kprefetchd is currently
running on is not enough to ensure that its work does not noticeably
consume resources on SMP.
Clear the busy bit only if it is set.
Signed-off-by: Con Kolivas <[email protected]>
---
mm/swap_prefetch.c | 154 ++++++++++++++++++++++++++++++++++++++++-------------
1 files changed, 118 insertions(+), 36 deletions(-)
Index: linux-2.6.16-rc5-mm3/mm/swap_prefetch.c
===================================================================
--- linux-2.6.16-rc5-mm3.orig/mm/swap_prefetch.c 2006-03-10 15:29:11.000000000 +1100
+++ linux-2.6.16-rc5-mm3/mm/swap_prefetch.c 2006-03-10 20:36:56.000000000 +1100
@@ -150,21 +150,31 @@ enum trickle_return {
TRICKLE_DELAY,
};
+struct node_stats {
+ unsigned long last_free;
+ /* Free ram after a cycle of prefetching */
+ unsigned long current_free;
+ /* Free ram on this cycle of checking prefetch_suitable */
+ unsigned long prefetch_watermark;
+ /* Maximum amount we will prefetch to */
+ unsigned long highfree[MAX_NR_ZONES];
+ /* The amount of free ram before we start prefetching */
+ unsigned long lowfree[MAX_NR_ZONES];
+ /* The amount of free ram where we will stop prefetching */
+ unsigned long *pointfree[MAX_NR_ZONES];
+ /* highfree or lowfree depending on whether we've hit a watermark */
+};
+
/*
* prefetch_stats stores the free ram data of each node and this is used to
* determine if a node is suitable for prefetching into.
*/
-struct prefetch_stats{
- unsigned long last_free[MAX_NUMNODES];
- /* Free ram after a cycle of prefetching */
- unsigned long current_free[MAX_NUMNODES];
- /* Free ram on this cycle of checking prefetch_suitable */
- unsigned long prefetch_watermark[MAX_NUMNODES];
- /* Maximum amount we will prefetch to */
+struct prefetch_stats {
nodemask_t prefetch_nodes;
/* Which nodes are currently suited to prefetching */
unsigned long prefetched_pages;
/* Total pages we've prefetched on this wakeup of kprefetchd */
+ struct node_stats node[MAX_NUMNODES];
};
static struct prefetch_stats sp_stat;
@@ -211,7 +221,7 @@ static enum trickle_return trickle_swap_
}
sp_stat.prefetched_pages++;
- sp_stat.last_free[node]--;
+ sp_stat.node[node].last_free--;
ret = TRICKLE_SUCCESS;
out_release:
@@ -229,8 +239,11 @@ static void clear_last_prefetch_free(voi
* update the data to take into account memory hotplug if desired..
*/
sp_stat.prefetch_nodes = node_online_map;
- for_each_node_mask(node, sp_stat.prefetch_nodes)
- sp_stat.last_free[node] = 0;
+ for_each_node_mask(node, sp_stat.prefetch_nodes) {
+ struct node_stats *ns = &sp_stat.node[node];
+
+ ns->last_free = 0;
+ }
}
static void clear_current_prefetch_free(void)
@@ -238,8 +251,43 @@ static void clear_current_prefetch_free(
int node;
sp_stat.prefetch_nodes = node_online_map;
- for_each_node_mask(node, sp_stat.prefetch_nodes)
- sp_stat.current_free[node] = 0;
+ for_each_node_mask(node, sp_stat.prefetch_nodes) {
+ struct node_stats *ns = &sp_stat.node[node];
+
+ ns->current_free = 0;
+ }
+}
+
+/*
+ * This updates the high and low watermarks of amount of free ram in each
+ * node used to start and stop prefetching. We prefetch from pages_high * 4
+ * down to pages_high * 3.
+ */
+static void examine_free_limits(void)
+{
+ struct zone *z;
+
+ for_each_zone(z) {
+ struct node_stats *ns;
+ int idx;
+
+ if (!populated_zone(z))
+ continue;
+
+ ns = &sp_stat.node[z->zone_pgdat->node_id];
+ idx = zone_idx(z);
+ ns->lowfree[idx] = z->pages_high * 3 + z->lowmem_reserve[idx];
+ ns->highfree[idx] = ns->lowfree[idx] + z->pages_high;
+
+ if (z->free_pages > ns->highfree[idx]) {
+ /*
+ * We've gotten above the high watermark of free pages
+ * so we can start prefetching till we get to the low
+ * watermark.
+ */
+ ns->pointfree[idx] = &ns->lowfree[idx];
+ }
+ }
}
/*
@@ -247,14 +295,34 @@ static void clear_current_prefetch_free(
*/
static int prefetch_suitable(void)
{
- struct page_state ps;
unsigned long limit;
struct zone *z;
- int node, ret = 0;
+ int node, ret = 0, test_pagestate = 0;
- /* Purposefully racy and might return false positive which is ok */
- if (__test_and_clear_bit(0, &swapped.busy))
+ /* Purposefully racy */
+ if (test_bit(0, &swapped.busy)) {
+ __clear_bit(0, &swapped.busy);
goto out;
+ }
+
+ /*
+ * get_page_state is super expensive so we only perform it every
+ * SWAP_CLUSTER_MAX prefetched_pages. We also test if we're the only
+ * task running anywhere. We want to have as little impact on all
+ * resources (cpu, disk, bus etc). As this iterates over every cpu
+ * we measure this infrequently.
+ */
+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
+ unsigned long cpuload = nr_running();
+
+ if (cpuload > 1)
+ goto out;
+ cpuload += nr_uninterruptible();
+ if (cpuload > 1)
+ goto out;
+
+ test_pagestate = 1;
+ }
clear_current_prefetch_free();
@@ -263,18 +331,29 @@ static int prefetch_suitable(void)
* will occur to prevent ping-ponging between them.
*/
for_each_zone(z) {
+ struct node_stats *ns;
unsigned long free;
+ int idx;
if (!populated_zone(z))
continue;
+
node = z->zone_pgdat->node_id;
+ ns = &sp_stat.node[node];
+ idx = zone_idx(z);
free = z->free_pages;
- if (z->pages_high * 3 + z->lowmem_reserve[zone_idx(z)] > free) {
+ if (free < *ns->pointfree[idx]) {
+ /*
+ * Free pages have dropped below the low watermark so
+ * we won't start prefetching again till we hit the
+ * high watermark of free pages.
+ */
+ ns->pointfree[idx] = &ns->highfree[idx];
node_clear(node, sp_stat.prefetch_nodes);
continue;
}
- sp_stat.current_free[node] += free;
+ ns->current_free += free;
}
/*
@@ -282,28 +361,26 @@ static int prefetch_suitable(void)
* prefetching and clear the nodemask if it is not.
*/
for_each_node_mask(node, sp_stat.prefetch_nodes) {
+ struct node_stats *ns = &sp_stat.node[node];
+ struct page_state ps;
+
/*
* We check to see that pages are not being allocated
* elsewhere at any significant rate implying any
* degree of memory pressure (eg during file reads)
*/
- if (sp_stat.last_free[node]) {
- if (sp_stat.current_free[node] + SWAP_CLUSTER_MAX <
- sp_stat.last_free[node]) {
- sp_stat.last_free[node] =
- sp_stat.current_free[node];
+ if (ns->last_free) {
+ if (ns->current_free + SWAP_CLUSTER_MAX <
+ ns->last_free) {
+ ns->last_free = ns->current_free;
node_clear(node,
sp_stat.prefetch_nodes);
continue;
}
} else
- sp_stat.last_free[node] = sp_stat.current_free[node];
+ ns->last_free = ns->current_free;
- /*
- * get_page_state is super expensive so we only perform it
- * every SWAP_CLUSTER_MAX prefetched_pages
- */
- if (sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)
+ if (!test_pagestate)
continue;
get_page_state_node(&ps, node);
@@ -324,7 +401,7 @@ static int prefetch_suitable(void)
*/
limit = ps.nr_mapped + ps.nr_slab + ps.nr_dirty +
ps.nr_unstable + total_swapcache_pages;
- if (limit > sp_stat.prefetch_watermark[node]) {
+ if (limit > ns->prefetch_watermark) {
node_clear(node, sp_stat.prefetch_nodes);
continue;
}
@@ -370,6 +447,7 @@ static enum trickle_return trickle_swap(
if (!swap_prefetch || laptop_mode)
return ret;
+ examine_free_limits();
entry = NULL;
for ( ; ; ) {
@@ -459,8 +537,7 @@ static int kprefetchd(void *__unused)
*/
void __init prepare_swap_prefetch(void)
{
- pg_data_t *pgdat;
- int node;
+ struct zone *zone;
swapped.cache = kmem_cache_create("swapped_entry",
sizeof(struct swapped_entry), 0, SLAB_PANIC, NULL, NULL);
@@ -471,14 +548,19 @@ void __init prepare_swap_prefetch(void)
*/
swapped.maxcount = nr_free_pagecache_pages() / 3 * 2;
- for_each_online_pgdat(pgdat) {
+ for_each_zone(zone) {
unsigned long present;
+ struct node_stats *ns;
+ int idx;
- present = pgdat->node_present_pages;
+ present = zone->present_pages;
if (!present)
continue;
- node = pgdat->node_id;
- sp_stat.prefetch_watermark[node] += present / 3 * 2;
+
+ ns = &sp_stat.node[zone->zone_pgdat->node_id];
+ ns->prefetch_watermark += present / 3 * 2;
+ idx = zone_idx(zone);
+ ns->pointfree[idx] = &ns->highfree[idx];
}
}
Con Kolivas <[email protected]> wrote:
>
> + /*
> + * get_page_state is super expensive so we only perform it every
> + * SWAP_CLUSTER_MAX prefetched_pages.
nr_running() is similarly expensive btw.
> * We also test if we're the only
> + * task running anywhere. We want to have as little impact on all
> + * resources (cpu, disk, bus etc). As this iterates over every cpu
> + * we measure this infrequently.
> + */
> + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> + unsigned long cpuload = nr_running();
> +
> + if (cpuload > 1)
> + goto out;
Sorry, this is just wrong. If swap prefetch is useful then it's also
useful if some task happens to be sitting over in the corner calculating
pi.
What's the actual problem here? Someone's 3d game went blippy? Why? How
much? Are we missing a cond_resched()?
> + cpuload += nr_uninterruptible();
> + if (cpuload > 1)
> + goto out;
Not sure about this either.
> + if (ns->last_free) {
> + if (ns->current_free + SWAP_CLUSTER_MAX <
> + ns->last_free) {
> + ns->last_free = ns->current_free;
> node_clear(node,
> sp_stat.prefetch_nodes);
> continue;
> }
> } else
That has an extra tabstop.
Andrew Morton wrote:
> Con Kolivas <[email protected]> wrote:
>
>>+ /*
>>+ * get_page_state is super expensive so we only perform it every
>>+ * SWAP_CLUSTER_MAX prefetched_pages.
>
>
> nr_running() is similarly expensive btw.
>
>
>> * We also test if we're the only
>>+ * task running anywhere. We want to have as little impact on all
>>+ * resources (cpu, disk, bus etc). As this iterates over every cpu
>>+ * we measure this infrequently.
>>+ */
>>+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
>>+ unsigned long cpuload = nr_running();
>>+
>>+ if (cpuload > 1)
>>+ goto out;
>
>
> Sorry, this is just wrong. If swap prefetch is useful then it's also
> useful if some task happens to be sitting over in the corner calculating
> pi.
On SMP systems, something based on the run queues' raw_weighted_load
fields (comes with smpnice patch) might be more useful than nr_running()
as it contains information about the priority of the running tasks.
Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation
(where raw_weighted_load() is the sum of that field for all CPUs) would
suffice. It would mean "there's more than the equivalent of one nice==0
task running" and shouldn't be any more expensive than nr_running().
Dividing SCHED_LOAD_SCALE by some number would be an obvious variation
to try as would taking into account this process's contribution to the
weighted load.
Also if this was useful there's no real reason that raw_weighted_load
couldn't be made available on non SMP systems as well as SMP ones.
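Roughly, I'm thinking of something like this untested sketch; it assumes
smpnice's per-runqueue raw_weighted_load field and uses the
scheduler-internal cpu_rq() accessor, so it would have to live in (or be
exported from) sched.c:
	static unsigned long global_weighted_load(void)
	{
		unsigned long load = 0;
		int cpu;
		for_each_online_cpu(cpu)
			load += cpu_rq(cpu)->raw_weighted_load;
		return load;
	}
	/* "more than the equivalent of one nice==0 task running" */
	if (global_weighted_load() > SCHED_LOAD_SCALE)
		goto out;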
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> Con Kolivas <[email protected]> wrote:
> > + /*
> > + * get_page_state is super expensive so we only perform it every
> > + * SWAP_CLUSTER_MAX prefetched_pages.
>
> nr_running() is similarly expensive btw.
Yes which is why I do it just as infrequently as get_page_state.
>
> > * We also test if we're the only
> > + * task running anywhere. We want to have as little impact on all
> > + * resources (cpu, disk, bus etc). As this iterates over every cpu
> > + * we measure this infrequently.
> > + */
> > + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > + unsigned long cpuload = nr_running();
> > +
> > + if (cpuload > 1)
> > + goto out;
>
> Sorry, this is just wrong. If swap prefetch is useful then it's also
> useful if some task happens to be sitting over in the corner calculating
> pi.
>
> What's the actual problem here? Someone's 3d game went blippy? Why? How
> much? Are we missing a cond_resched()?
No, it's pretty easy to reproduce, kprefetchd sits there in uninterruptible
sleep with one cpu on SMP pegged at 100% iowait due to it. This tends to have
noticeable effects everywhere on HT or SMP. On UP the yielding helped it but
even then it still causes blips. How much? Well to be honest it's noticeable
a shipload. Running a game, any game, that uses 100% (and most fancy games
do) causes stuttering on audio, pauses and so on. This is evident on linux
native games, games under emulators or qemu and so on. That iowait really
hurts, and tweaking just priority doesn't help it in any way.
With this change it's much more polite and takes a bit longer to complete
prefetching but is still effective while no longer being noticeable.
> > + cpuload += nr_uninterruptible();
> > + if (cpuload > 1)
> > + goto out;
>
> Not sure about this either.
Same as above. It's the tasks in uninterruptible sleep that cause the most
harm. I do it sequentially simply because nr_running() is more likely to be
>1 than the sum total, and I'd prefer not to do nr_uninterruptible() unless
it's necessary. Both of these are actually done lockless though.
> > + if (ns->last_free) {
> > + if (ns->current_free + SWAP_CLUSTER_MAX <
> > + ns->last_free) {
> > + ns->last_free = ns->current_free;
> > node_clear(node,
> > sp_stat.prefetch_nodes);
> > continue;
> > }
> > } else
>
> That has an extra tabstop.
Hrm. 3 years on and I still make basic style mistakes.
Cheers,
Con
On Saturday 11 March 2006 10:11, Peter Williams wrote:
> Andrew Morton wrote:
> > Con Kolivas <[email protected]> wrote:
> >>+ /*
> >>+ * get_page_state is super expensive so we only perform it every
> >>+ * SWAP_CLUSTER_MAX prefetched_pages.
> >
> > nr_running() is similarly expensive btw.
> >
> >> * We also test if we're the only
> >>+ * task running anywhere. We want to have as little impact on all
> >>+ * resources (cpu, disk, bus etc). As this iterates over every cpu
> >>+ * we measure this infrequently.
> >>+ */
> >>+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> >>+ unsigned long cpuload = nr_running();
> >>+
> >>+ if (cpuload > 1)
> >>+ goto out;
> >
> > Sorry, this is just wrong. If swap prefetch is useful then it's also
> > useful if some task happens to be sitting over in the corner calculating
> > pi.
>
> On SMP systems, something based on the run queues' raw_weighted_load
> fields (comes with smpnice patch) might be more useful than nr_running()
> as it contains information about the priority of the running tasks.
> Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation
> (where raw_weighted_load() is the sum of that field for all CPUs) would
> suffice. It would mean "there's more than the equivalent of one nice==0
> task running" and shouldn't be any more expensive than nr_running().
> Dividing SCHED_LOAD_SCALE by some number would be an obvious variation
> to try as would taking into account this process's contribution to the
> weighted load.
>
> Also if this was useful there's no real reason that raw_weighted_load
> couldn't be made available on non SMP systems as well as SMP ones.
That does seem reasonable, but I'm looking at total system load, not per
runqueue. So a global_weighted_load() function would be required to return
that. Because despite what anyone seems to want to believe, reading from disk
hurts. Why it hurts so much I'm not really sure, but it's not a SCSI vs IDE
with or without DMA issue. It's not about tweaking parameters. It doesn't
seem to be only about cpu cycles. This is not a mistuned system that it
happens on. It just plain hurts if we do lots of disk i/o, perhaps it's
saturating the bus or something. Whatever it is, as much as I'd _like_ swap
prefetch to just keep working quietly at ultra ultra low priority, the disk
reads that swap prefetch does are not innocuous so I really do want them to
only be done when nothing else wants cpu.
Cheers,
Con
Con Kolivas wrote:
> On Saturday 11 March 2006 10:11, Peter Williams wrote:
>
>>Andrew Morton wrote:
>>
>>>Con Kolivas <[email protected]> wrote:
>>>
>>>>+ /*
>>>>+ * get_page_state is super expensive so we only perform it every
>>>>+ * SWAP_CLUSTER_MAX prefetched_pages.
>>>
>>>nr_running() is similarly expensive btw.
>>>
>>>
>>>> * We also test if we're the only
>>>>+ * task running anywhere. We want to have as little impact on all
>>>>+ * resources (cpu, disk, bus etc). As this iterates over every cpu
>>>>+ * we measure this infrequently.
>>>>+ */
>>>>+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
>>>>+ unsigned long cpuload = nr_running();
>>>>+
>>>>+ if (cpuload > 1)
>>>>+ goto out;
>>>
>>>Sorry, this is just wrong. If swap prefetch is useful then it's also
>>>useful if some task happens to be sitting over in the corner calculating
>>>pi.
>>
>>On SMP systems, something based on the run queues' raw_weighted_load
>>fields (comes with smpnice patch) might be more useful than nr_running()
>>as it contains information about the priority of the running tasks.
>>Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation
>>(where raw_weighted_load() is the sum of that field for all CPUs) would
>>suffice. It would mean "there's more than the equivalent of one nice==0
>>task running" and shouldn't be any more expensive than nr_running().
>>Dividing SCHED_LOAD_SCALE by some number would be an obvious variation
>>to try as would taking into account this process's contribution to the
>>weighted load.
>>
>>Also if this was useful there's no real reason that raw_weighted_load
>>couldn't be made available on non SMP systems as well as SMP ones.
>
>
> That does seem reasonable, but I'm looking at total system load, not per
> runqueue. So a global_weighted_load() function would be required to return
> that.
Yes. That's why I said "something based on".
> Because despite what anyone seems to want to believe, reading from disk
> hurts. Why it hurts so much I'm not really sure, but it's not a SCSI vs IDE
> with or without DMA issue. It's not about tweaking parameters. It doesn't
> seem to be only about cpu cycles. This is not a mistuned system that it
> happens on. It just plain hurts if we do lots of disk i/o, perhaps it's
> saturating the bus or something. Whatever it is, as much as I'd _like_ swap
> prefetch to just keep working quietly at ultra ultra low priority, the disk
> reads that swap prefetch does are not innocuous so I really do want them to
> only be done when nothing else wants cpu.
Would you like to try a prototype version of the soft caps patch I'm
working on to see if it will help?
Peter
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Saturday 11 March 2006 15:28, Peter Williams wrote:
> Con Kolivas wrote:
> > Because despite what anyone seems to want to believe, reading from disk
> > hurts. Why it hurts so much I'm not really sure, but it's not a SCSI vs
> > IDE with or without DMA issue. It's not about tweaking parameters. It
> > doesn't seem to be only about cpu cycles. This is not a mistuned system
> > that it happens on. It just plain hurts if we do lots of disk i/o,
> > perhaps it's saturating the bus or something. Whatever it is, as much as
> > I'd _like_ swap prefetch to just keep working quietly at ultra ultra low
> > priority, the disk reads that swap prefetch does are not innocuous so I
> > really do want them to only be done when nothing else wants cpu.
I didn't make it clear here the things affected are not even doing any I/O of
their own. It's not about I/O resource allocation. However they are using
100% cpu and probably doing a lot of gpu bus traffic.
> Would you like to try a prototype version of the soft caps patch I'm
> working on to see if it will help?
What happens if it's using .01% cpu and spends most of its time in
uninterruptible sleep?
Cheers,
Con
On Saturday 11 March 2006 05:18, Con Kolivas wrote yet:
> On Saturday 11 March 2006 10:11, Peter Williams wrote:
> > Andrew Morton wrote:
> > > Con Kolivas <[email protected]> wrote:
> > >> * We also test if we're the only
> > >>+ * task running anywhere. We want to have as little impact on all
> > >>+ * resources (cpu, disk, bus etc). As this iterates over every cpu
> > >>+ * we measure this infrequently.
> > >>+ */
> > >>+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > >>+ unsigned long cpuload = nr_running();
> > >>+
> > >>+ if (cpuload > 1)
> > >>+ goto out;
> > >
> > > Sorry, this is just wrong. If swap prefetch is useful then it's also
> > > useful if some task happens to be sitting over in the corner
> > > calculating pi.
> >
> > On SMP systems, something based on the run queues' raw_weighted_load
> > fields (comes with smpnice patch) might be more useful than nr_running()
> > as it contains information about the priority of the running tasks.
> > Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation
> > (where raw_weighted_load() is the sum of that field for all CPUs) would
> > suffice. It would mean "there's more than the equivalent of one nice==0
> > task running" and shouldn't be any more expensive than nr_running().
> > Dividing SCHED_LOAD_SCALE by some number would be an obvious variation
> > to try as would taking into account this process's contribution to the
> > weighted load.
> >
> > Also if this was useful there's no real reason that raw_weighted_load
> > couldn't be made available on non SMP systems as well as SMP ones.
>
> That does seem reasonable, but I'm looking at total system load, not per
> runqueue. So a global_weighted_load() function would be required to return
> that. Because despite what anyone seems to want to believe, reading from
> disk hurts. Why it hurts so much I'm not really sure, but it's not a SCSI
> vs IDE with or without DMA issue. It's not about tweaking parameters. It
> doesn't seem to be only about cpu cycles. This is not a mistuned system
> that it happens on. It just plain hurts if we do lots of disk i/o, perhaps
> it's saturating the bus or something. Whatever it is, as much as I'd _like_
> swap prefetch to just keep working quietly at ultra ultra low priority, the
> disk reads that swap prefetch does are not innocuous so I really do want
> them to only be done when nothing else wants cpu.
Wouldn't the change break prefetching if I have 98% CPU time free and not
100%? Something like an audio player in the background?
It seems that any Seti@home type of calculation would kill it.
In reality, we don't want disk reads when something interactive is running, so
maybe you'd look at the nice level of the task?
(higher than x = don't count it?)
--
GPG Key id: 0xD1F10BA2
Fingerprint: 96E2 304A B9C4 949A 10A0 9105 9543 0453 D1F1 0BA2
AstralStorm
On Saturday 11 March 2006 16:04, Radoslaw Szkodzinski wrote:
> On Saturday 11 March 2006 05:18, Con Kolivas wrote yet:
> > On Saturday 11 March 2006 10:11, Peter Williams wrote:
> > > Andrew Morton wrote:
> > > > Con Kolivas <[email protected]> wrote:
> > > >> * We also test if we're the only
> > > >>+ * task running anywhere. We want to have as little impact on all
> > > >>+ * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > >>+ * we measure this infrequently.
> > > >>+ */
> > > >>+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > >>+ unsigned long cpuload = nr_running();
> > > >>+
> > > >>+ if (cpuload > 1)
> > > >>+ goto out;
> > > >
> > > > Sorry, this is just wrong. If swap prefetch is useful then it's also
> > > > useful if some task happens to be sitting over in the corner
> > > > calculating pi.
> > >
> > > On SMP systems, something based on the run queues' raw_weighted_load
> > > fields (comes with smpnice patch) might be more useful than
> > > nr_running() as it contains information about the priority of the
> > > running tasks. Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some
> > > variation (where raw_weighted_load() is the sum of that field for all
> > > CPUs) would suffice. It would mean "there's more than the equivalent
> > > of one nice==0 task running" and shouldn't be any more expensive than
> > > nr_running(). Dividing SCHED_LOAD_SCALE by some number would be an
> > > obvious variation to try as would taking into account this process's
> > > contribution to the weighted load.
> > >
> > > Also if this was useful there's no real reason that raw_weighted_load
> > > couldn't be made available on non SMP systems as well as SMP ones.
> >
> > That does seem reasonable, but I'm looking at total system load, not per
> > runqueue. So a global_weighted_load() function would be required to
> > return that. Because despite what anyone seems to want to believe,
> > reading from disk hurts. Why it hurts so much I'm not really sure, but
> > it's not a SCSI vs IDE with or without DMA issue. It's not about tweaking
> > parameters. It doesn't seem to be only about cpu cycles. This is not a
> > mistuned system that it happens on. It just plain hurts if we do lots of
> > disk i/o, perhaps it's saturating the bus or something. Whatever it is,
> > as much as I'd _like_ swap prefetch to just keep working quietly at ultra
> > ultra low priority, the disk reads that swap prefetch does are not
> > innocuous so I really do want them to only be done when nothing else
> > wants cpu.
>
> Wouldn't the change break prefetching if I have 98% CPU time free and not
> 100%? Something like an audio player in the background?
That would only intermittently stop prefetching whenever both happened to use
cpu at exactly the same time (which is the desired effect). So playing audio
will slow prefetch a little but it will still prefetch.
> It seems that any Seti@home type of calculation would kill it.
> In reality, we don't want disk reads when something interactive is running,
> so maybe you'd look at the nice level of the task?
> (higher than x = don't count it?)
That's what Peter is promoting here. I could use the "weighted load" value to
determine just that, and keep running prefetch if highly niced tasks are
running. I am considering adding that in the future. For the moment I
definitely think opting out of prefetching whenever anything is running is
the right thing to do.
Cheers,
Con
Con Kolivas wrote:
> On Saturday 11 March 2006 15:28, Peter Williams wrote:
>
>>Con Kolivas wrote:
>>
>>>Because despite what anyone seems to want to believe, reading from disk
>>>hurts. Why it hurts so much I'm not really sure, but it's not a SCSI vs
>>>IDE with or without DMA issue. It's not about tweaking parameters. It
>>>doesn't seem to be only about cpu cycles. This is not a mistuned system
>>>that it happens on. It just plain hurts if we do lots of disk i/o,
>>>perhaps it's saturating the bus or something. Whatever it is, as much as
>>>I'd _like_ swap prefetch to just keep working quietly at ultra ultra low
>>>priority, the disk reads that swap prefetch does are not innocuous so I
>>>really do want them to only be done when nothing else wants cpu.
>
>
> I didn't make it clear here the things affected are not even doing any I/O of
> their own. It's not about I/O resource allocation. However they are using
> 100% cpu and probably doing a lot of gpu bus traffic.
>
>
>>Would you like to try a prototype version of the soft caps patch I'm
>>working on to see if it will help?
>
>
> What happens if it's using .01% cpu and spends most of its time in
> uninterruptible sleep?
Probably not much, as I have to let tasks with a soft cap of zero get
some CPU to avoid problems with them holding resources other tasks may
need, and 0.01% is probably as low as I can keep it anyway.
Just to clarify: at the moment, what I do to tasks with a zero soft
cap is give them a priority one above MAX_PRIO (i.e. 2 higher than any
other task can have) and make sure they always go on the expired array
at the end of their time slice. They also get a load weight of zero to
prevent them getting a CPU to themselves. This means that any task that
becomes runnable on their CPU should preempt them, and if they're the
only task on their CPU it will look idle and waking tasks may be moved
there if the other CPUs are idle. This may be enough to stop them
interfering with your game's tasks.
I'm currently letting them have a time slice determined by their nice in
an attempt to reduce context switching, but this may change as it
probably allows them to get CPU access when there are non-background
tasks on the expired array. I'm still thinking about how to prevent
this and keep context switching low.
Tasks with non-zero soft caps go through a different process and (as far
as possible) tasks without soft caps avoid the capping code.
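In rough pseudo-C, the zero cap handling looks something like this (the
field and helper names here are invented purely for illustration; this
is not the actual patch):
	/* illustration only -- invented names, not the real soft caps code */
	if (p->soft_cap == 0) {
		p->prio = MAX_PRIO + 1;		/* below every other task */
		set_load_weight_zero(p);	/* never claim a cpu to ourselves */
		p->always_requeue_expired = 1;	/* straight to the expired array */
	}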
Peter
PS This is still work in progress.
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > Con Kolivas <[email protected]> wrote:
> > > + /*
> > > + * get_page_state is super expensive so we only perform it every
> > > + * SWAP_CLUSTER_MAX prefetched_pages.
> >
> > nr_running() is similarly expensive btw.
>
> Yes which is why I do it just as infrequently as get_page_state.
> >
> > > * We also test if we're the only
> > > + * task running anywhere. We want to have as little impact on all
> > > + * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > + * we measure this infrequently.
> > > + */
> > > + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > + unsigned long cpuload = nr_running();
> > > +
> > > + if (cpuload > 1)
> > > + goto out;
> >
> > Sorry, this is just wrong. If swap prefetch is useful then it's also
> > useful if some task happens to be sitting over in the corner calculating
> > pi.
> >
> > What's the actual problem here? Someone's 3d game went blippy? Why? How
> > much? Are we missing a cond_resched()?
>
> No, it's pretty easy to reproduce, kprefetchd sits there in uninterruptible
> sleep with one cpu on SMP pegged at 100% iowait due to it. This tends to have
> noticeable effects everywhere on HT or SMP. On UP the yielding helped it but
> even then it still causes blips. How much? Well to be honest it's noticeable
> a shipload. Running a game, any game, that uses 100% (and most fancy games
> do) causes stuttering on audio, pauses and so on. This is evident on linux
> native games, games under emulators or qemu and so on. That iowait really
> hurts, and tweaking just priority doesn't help it in any way.
That doesn't really make sense to me. If a task can trigger audio
dropout and stalls by sleeping, we have a serious problem. In your
SMP/HT case, I'd start crawling over the load balancing code. I can't
see how trivial CPU with non-saturated IO can cause dropout in the UP
case either. Am I missing something?
-Mike
Con Kolivas wrote:
> On Saturday 11 March 2006 10:11, Peter Williams wrote:
>
>>Andrew Morton wrote:
>>
>>>Con Kolivas <[email protected]> wrote:
>>>
>>>>+ /*
>>>>+ * get_page_state is super expensive so we only perform it every
>>>>+ * SWAP_CLUSTER_MAX prefetched_pages.
>>>
>>>nr_running() is similarly expensive btw.
>>>
>>>
>>>> * We also test if we're the only
>>>>+ * task running anywhere. We want to have as little impact on all
>>>>+ * resources (cpu, disk, bus etc). As this iterates over every cpu
>>>>+ * we measure this infrequently.
>>>>+ */
>>>>+ if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
>>>>+ unsigned long cpuload = nr_running();
>>>>+
>>>>+ if (cpuload > 1)
>>>>+ goto out;
>>>
>>>Sorry, this is just wrong. If swap prefetch is useful then it's also
>>>useful if some task happens to be sitting over in the corner calculating
>>>pi.
>>
>>On SMP systems, something based on the run queues' raw_weighted_load
>>fields (comes with smpnice patch) might be more useful than nr_running()
>>as it contains information about the priority of the running tasks.
>>Perhaps (raw_weighted_load() > SCHED_LOAD_SCALE) or some variation
>>(where raw_weighted_load() is the sum of that field for all CPUs) would
>>suffice. It would mean "there's more than the equivalent of one nice==0
>>task running" and shouldn't be any more expensive than nr_running().
>>Dividing SCHED_LOAD_SCALE by some number would be an obvious variation
>>to try as would taking into account this process's contribution to the
>>weighted load.
>>
>>Also if this was useful there's no real reason that raw_weighted_load
>>couldn't be made available on non SMP systems as well as SMP ones.
>
>
> That does seem reasonable, but I'm looking at total system load, not per
> runqueue. So a global_weighted_load() function would be required to return
> that.
Just another thought here. Any function such as this and nr_running()
will be highly racy unless you lock all run queues while running it and
while you perform the action dependent on the result (which I presume
you don't do). This means the answer you get back is probably wrong by
the time you make a decision based on the answer.
So is there any reason that you can't make the decision inside the loop
iterating over the CPUs on a per CPU basis? This would remove the
raciness. The only thing that I can think of is that you're trying to
avoid the cost of that loop but you'll wear most of that running
global_weighted_load() or nr_running() anyway.
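To illustrate, an untested sketch (the helper name is mine, and it again
uses scheduler internals like cpu_rq() that aren't visible outside
sched.c):
	static int something_else_running(void)
	{
		int cpu;
		for_each_online_cpu(cpu) {
			unsigned long running = cpu_rq(cpu)->nr_running;
			/* allow for kprefetchd itself on its own cpu */
			if (cpu == task_cpu(current))
				running--;
			if (running)
				return 1;
		}
		return 0;
	}
The decision about each CPU is then made as its count is read, rather
than after summing a global snapshot that may already be stale.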
> Because despite what anyone seems to want to believe, reading from disk
> hurts. Why it hurts so much I'm not really sure, but it's not a SCSI vs IDE
> with or without DMA issue. It's not about tweaking parameters. It doesn't
> seem to be only about cpu cycles. This is not a mistuned system that it
> happens on. It just plain hurts if we do lots of disk i/o, perhaps it's
> saturating the bus or something. Whatever it is, as much as I'd _like_ swap
> prefetch to just keep working quietly at ultra ultra low priority, the disk
> reads that swap prefetch does are not innocuous so I really do want them to
> only be done when nothing else wants cpu.
>
> Cheers,
> Con
--
Peter Williams [email protected]
"Learning, n. The kind of ignorance distinguishing the studious."
-- Ambrose Bierce
On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > Con Kolivas <[email protected]> wrote:
> > > > + /*
> > > > + * get_page_state is super expensive so we only perform it every
> > > > + * SWAP_CLUSTER_MAX prefetched_pages.
> > >
> > > nr_running() is similarly expensive btw.
> >
> > Yes which is why I do it just as infrequently as get_page_state.
> >
> > > > * We also test if we're the only
> > > > + * task running anywhere. We want to have as little impact on all
> > > > + * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > > + * we measure this infrequently.
> > > > + */
> > > > + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > + unsigned long cpuload = nr_running();
> > > > +
> > > > + if (cpuload > 1)
> > > > + goto out;
> > >
> > > Sorry, this is just wrong. If swap prefetch is useful then it's also
> > > useful if some task happens to be sitting over in the corner
> > > calculating pi.
> > >
> > > What's the actual problem here? Someone's 3d game went blippy? Why?
> > > How much? Are we missing a cond_resched()?
> >
> > No, it's pretty easy to reproduce, kprefetchd sits there in
> > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > it. This tends to have noticeable effects everywhere on HT or SMP. On UP
> > the yielding helped it but even then it still causes blips. How much?
> > Well to be honest it's noticeable a shipload. Running a game, any game,
> > that uses 100% (and most fancy games do) causes stuttering on audio,
> > pauses and so on. This is evident on linux native games, games under
> > emulators or qemu and so on. That iowait really hurts, and tweaking just
> > priority doesn't help it in any way.
>
> That doesn't really make sense to me. If a task can trigger audio
> dropout and stalls by sleeping, we have a serious problem. In your
> SMP/HT case, I'd start crawling over the load balancing code. I can't
> see how trivial CPU with non-saturated IO can cause dropout in the UP
> case either. Am I missing something?
Clearly you, me and everyone else is missing something. I see it with each
task bound to one cpu with cpu affinity so it's not a balancing issue. Try it
yourself if you can instead of not believing me. Get a big dd reader
(virtually no cpu and all io wait sleep) on one cpu and try and play a game
on the other cpu. It dies rectally.
Cheers,
Con
On Saturday 11 March 2006 16:50, Con Kolivas wrote:
> On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > Con Kolivas <[email protected]> wrote:
> > > > > + /*
> > > > > + * get_page_state is super expensive so we only perform it every
> > > > > + * SWAP_CLUSTER_MAX prefetched_pages.
> > > >
> > > > nr_running() is similarly expensive btw.
> > >
> > > Yes which is why I do it just as infrequently as get_page_state.
> > >
> > > > > * We also test if we're the only
> > > > > + * task running anywhere. We want to have as little impact on all
> > > > > + * resources (cpu, disk, bus etc). As this iterates over every
> > > > > cpu + * we measure this infrequently.
> > > > > + */
> > > > > + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > + unsigned long cpuload = nr_running();
> > > > > +
> > > > > + if (cpuload > 1)
> > > > > + goto out;
> > > >
> > > > Sorry, this is just wrong. If swap prefetch is useful then it's also
> > > > useful if some task happens to be sitting over in the corner
> > > > calculating pi.
> > > >
> > > > What's the actual problem here? Someone's 3d game went blippy? Why?
> > > > How much? Are we missing a cond_resched()?
> > >
> > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > > it. This tends to have noticeable effects everywhere on HT or SMP. On
> > > UP the yielding helped it but even then it still causes blips. How
> > > much? Well to be honest it's noticeable a shipload. Running a game, any
> > > game, that uses 100% (and most fancy games do) causes stuttering on
> > > audio, pauses and so on. This is evident on linux native games, games
> > > under emulators or qemu and so on. That iowait really hurts, and
> > > tweaking just priority doesn't help it in any way.
> >
> > That doesn't really make sense to me. If a task can trigger audio
> > dropout and stalls by sleeping, we have a serious problem. In your
> > SMP/HT case, I'd start crawling over the load balancing code. I can't
> > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > case either. Am I missing something?
>
> Clearly you, me and everyone else is missing something. I see it with each
> task bound to one cpu with cpu affinity so it's not a balancing issue. Try
> it yourself if you can instead of not believing me. Get a big dd reader
> (virtually no cpu and all io wait sleep) on one cpu and try and play a game
> on the other cpu. It dies rectally.
I happen to have a tool to instrument this as you're probably aware
(interbench). Here is an old log I found of this:
--- Benchmarking simulated cpu of Gaming in the presence of simulated ---
Load     Latency +/- SD (ms)    Max Latency    % Desired CPU
None       0    +/- 0                     0              100
Write     36.5  +/- 103                 966             73.3
Read      17.2  +/- 22.9                244             85.3
Note the max latency being massive and desired cpu dropping. This is on a HT
machine.
Cheers,
Con
On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > Con Kolivas <[email protected]> wrote:
> > > > > + /*
> > > > > + * get_page_state is super expensive so we only perform it every
> > > > > + * SWAP_CLUSTER_MAX prefetched_pages.
> > > >
> > > > nr_running() is similarly expensive btw.
> > >
> > > Yes which is why I do it just as infrequently as get_page_state.
> > >
> > > > > * We also test if we're the only
> > > > > + * task running anywhere. We want to have as little impact on all
> > > > > + * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > > > + * we measure this infrequently.
> > > > > + */
> > > > > + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > + unsigned long cpuload = nr_running();
> > > > > +
> > > > > + if (cpuload > 1)
> > > > > + goto out;
> > > >
> > > > Sorry, this is just wrong. If swap prefetch is useful then it's also
> > > > useful if some task happens to be sitting over in the corner
> > > > calculating pi.
> > > >
> > > > What's the actual problem here? Someone's 3d game went blippy? Why?
> > > > How much? Are we missing a cond_resched()?
> > >
> > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > > it. This tends to have noticeable effects everywhere on HT or SMP. On UP
> > > the yielding helped it but even then it still causes blips. How much?
> > > Well to be honest it's noticeable a shipload. Running a game, any game,
> > > that uses 100% (and most fancy games do) causes stuttering on audio,
> > > pauses and so on. This is evident on linux native games, games under
> > > emulators or qemu and so on. That iowait really hurts, and tweaking just
> > > priority doesn't help it in any way.
> >
> > That doesn't really make sense to me. If a task can trigger audio
> > dropout and stalls by sleeping, we have a serious problem. In your
> > SMP/HT case, I'd start crawling over the load balancing code. I can't
> > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > case either. Am I missing something?
>
> Clearly you, me and everyone else is missing something. I see it with each
> task bound to one cpu with cpu affinity so it's not a balancing issue. Try it
> yourself if you can instead of not believing me. Get a big dd reader
> (virtually no cpu and all io wait sleep) on one cpu and try and play a game
> on the other cpu. It dies rectally.
I said it didn't make sense to me, not that I didn't believe you. If I
had a real SMP box, I would look into it, but all I have is HT.
If you're creating a lot of traffic, I can see it causing problems. I
was under the impression that you were doing minimal IO and absolutely
trivial CPU. That's what didn't make sense to me to be clear.
-Mike
On Sat, 2006-03-11 at 07:00 +0100, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> > On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > > Con Kolivas <[email protected]> wrote:
> > > > > > + /*
> > > > > > + * get_page_state is super expensive so we only perform it every
> > > > > > + * SWAP_CLUSTER_MAX prefetched_pages.
> > > > >
> > > > > nr_running() is similarly expensive btw.
> > > >
> > > > Yes which is why I do it just as infrequently as get_page_state.
> > > >
> > > > > > * We also test if we're the only
> > > > > > + * task running anywhere. We want to have as little impact on all
> > > > > > + * resources (cpu, disk, bus etc). As this iterates over every cpu
> > > > > > + * we measure this infrequently.
> > > > > > + */
> > > > > > + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > > + unsigned long cpuload = nr_running();
> > > > > > +
> > > > > > + if (cpuload > 1)
> > > > > > + goto out;
> > > > >
> > > > > Sorry, this is just wrong. If swap prefetch is useful then it's also
> > > > > useful if some task happens to be sitting over in the corner
> > > > > calculating pi.
> > > > >
> > > > > What's the actual problem here? Someone's 3d game went blippy? Why?
> > > > > How much? Are we missing a cond_resched()?
> > > >
> > > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > > > it. This tends to have noticeable effects everywhere on HT or SMP. On UP
> > > > the yielding helped it but even then it still causes blips. How much?
> > > > Well to be honest it's noticeable a shipload. Running a game, any game,
> > > > that uses 100% (and most fancy games do) causes stuttering on audio,
> > > > pauses and so on. This is evident on linux native games, games under
> > > > emulators or qemu and so on. That iowait really hurts, and tweaking just
> > > > priority doesn't help it in any way.
> > >
> > > That doesn't really make sense to me. If a task can trigger audio
> > > dropout and stalls by sleeping, we have a serious problem. In your
> > > SMP/HT case, I'd start crawling over the load balancing code. I can't
> > > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > > case either. Am I missing something?
> >
> > Clearly you, me and everyone else is missing something. I see it with each
> > task bound to one cpu with cpu affinity so it's not a balancing issue. Try it
> > yourself if you can instead of not believing me. Get a big dd reader
> > (virtually no cpu and all io wait sleep) on one cpu and try and play a game
> > on the other cpu. It dies rectally.
>
> I said it didn't make sense to me, not that I didn't believe you. If I
> had a real SMP box, I would look into it, but all I have is HT.
>
> If you're creating a lot of traffic, I can see it causing problems. I
> was under the impression that you were doing minimal IO and absolutely
> trivial CPU. That's what didn't make sense to me to be clear.
>
> -Mike
P.S. If it's hefty IO, it makes sense, and having the ability to do PIO
instead of DMA would probably help.
On Sat, 2006-03-11 at 16:58 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 16:50, Con Kolivas wrote:
> > On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > > Con Kolivas <[email protected]> wrote:
> > > > > > + /*
> > > > > > + * get_page_state is super expensive so we only perform it every
> > > > > > + * SWAP_CLUSTER_MAX prefetched_pages.
> > > > >
> > > > > nr_running() is similarly expensive btw.
> > > >
> > > > Yes which is why I do it just as infrequently as get_page_state.
> > > >
> > > > > > * We also test if we're the only
> > > > > > + * task running anywhere. We want to have as little impact on all
> > > > > > + * resources (cpu, disk, bus etc). As this iterates over every
> > > > > > cpu + * we measure this infrequently.
> > > > > > + */
> > > > > > + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > > + unsigned long cpuload = nr_running();
> > > > > > +
> > > > > > + if (cpuload > 1)
> > > > > > + goto out;
> > > > >
> > > > > Sorry, this is just wrong. If swap prefetch is useful then it's also
> > > > > useful if some task happens to be sitting over in the corner
> > > > > calculating pi.
> > > > >
> > > > > What's the actual problem here? Someone's 3d game went blippy? Why?
> > > > > How much? Are we missing a cond_resched()?
> > > >
> > > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due to
> > > > it. This tends to have noticeable effects everywhere on HT or SMP. On
> > > > UP the yielding helped it but even then it still causes blips. How
> > > > much? Well to be honest it's noticeable a shipload. Running a game, any
> > > > game, that uses 100% (and most fancy games do) causes stuttering on
> > > > audio, pauses and so on. This is evident on linux native games, games
> > > > under emulators or qemu and so on. That iowait really hurts, and
> > > > tweaking just priority doesn't help it in any way.
> > >
> > > That doesn't really make sense to me. If a task can trigger audio
> > > dropout and stalls by sleeping, we have a serious problem. In your
> > > SMP/HT case, I'd start crawling over the load balancing code. I can't
> > > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > > case either. Am I missing something?
> >
> > Clearly you, me and everyone else is missing something. I see it with each
> > task bound to one cpu with cpu affinity so it's not a balancing issue. Try
> > it yourself if you can instead of not believing me. Get a big dd reader
> > (virtually no cpu and all io wait sleep) on one cpu and try and play a game
> > on the other cpu. It dies rectally.
>
> I happen to have a tool to instrument this as you're probably aware
> (interbench). Here is an old log I found of this:
Yeah, I have a copy. Interpreting the results isn't necessarily easy
though, just as with any other benchmark.
>
> --- Benchmarking simulated cpu of Gaming in the presence of simulated ---
> Load     Latency +/- SD (ms)    Max Latency    % Desired CPU
> None       0    +/- 0                     0              100
> Write     36.5  +/- 103                 966             73.3
> Read      17.2  +/- 22.9                244             85.3
>
> Note the max latency being massive and desired cpu dropping. This is on a HT
> machine.
I wonder what that would look like with two real CPUs.
-Mike
On Saturday 11 March 2006 17:05, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 07:00 +0100, Mike Galbraith wrote:
> > On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> > > On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > > > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > > > Con Kolivas <[email protected]> wrote:
> > > > > > > + /*
> > > > > > > + * get_page_state is super expensive so we only perform it
> > > > > > > every + * SWAP_CLUSTER_MAX prefetched_pages.
> > > > > >
> > > > > > nr_running() is similarly expensive btw.
> > > > >
> > > > > Yes which is why I do it just as infrequently as get_page_state.
> > > > >
> > > > > > > * We also test if we're the only
> > > > > > > + * task running anywhere. We want to have as little impact on
> > > > > > > all + * resources (cpu, disk, bus etc). As this iterates over
> > > > > > > every cpu + * we measure this infrequently.
> > > > > > > + */
> > > > > > > + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > > > + unsigned long cpuload = nr_running();
> > > > > > > +
> > > > > > > + if (cpuload > 1)
> > > > > > > + goto out;
> > > > > >
> > > > > > Sorry, this is just wrong. If swap prefetch is useful then it's
> > > > > > also useful if some task happens to be sitting over in the corner
> > > > > > calculating pi.
> > > > > >
> > > > > > What's the actual problem here? Someone's 3d game went blippy?
> > > > > > Why? How much? Are we missing a cond_resched()?
> > > > >
> > > > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due
> > > > > to it. This tends to have noticeable effects everywhere on HT or
> > > > > SMP. On UP the yielding helped it but even then it still causes
> > > > > blips. How much? Well to be honest it's noticeable a shipload.
> > > > > Running a game, any game, that uses 100% (and most fancy games do)
> > > > > causes stuttering on audio, pauses and so on. This is evident on
> > > > > linux native games, games under emulators or qemu and so on. That
> > > > > iowait really hurts, and tweaking just priority doesn't help it in
> > > > > any way.
> > > >
> > > > That doesn't really make sense to me. If a task can trigger audio
> > > > dropout and stalls by sleeping, we have a serious problem. In your
> > > > SMP/HT case, I'd start crawling over the load balancing code. I
> > > > can't see how trivial CPU with non-saturated IO can cause dropout in
> > > > the UP case either. Am I missing something?
> > >
> > > Clearly you, me and everyone else is missing something. I see it with
> > > each task bound to one cpu with cpu affinity so it's not a balancing
> > > issue. Try it yourself if you can instead of not believing me. Get a
> > > big dd reader (virtually no cpu and all io wait sleep) on one cpu and
> > > try and play a game on the other cpu. It dies rectally.
> >
> > I said it didn't make sense to me, not that I didn't believe you. If I
> > had a real SMP box, I would look into it, but all I have is HT.
> >
> > If you're creating a lot of traffic, I can see it causing problems. I
> > was under the impression that you were doing minimal IO and absolutely
> > trivial CPU. That's what didn't make sense to me to be clear.
> P.S. If it's hefty IO, it makes sense, and having the ability to do PIO
> > instead of DMA would probably help.
That would probably be worse, because then it would use much more cpu in the
form of kernel context time and not be attributed to kprefetchd at all.
Anyway this is clearly not a workaround (yes I do know you weren't promoting
it as such).
Cheers,
Con
On Saturday 11 March 2006 17:00, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
> > On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
> > > On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
> > > > On Saturday 11 March 2006 09:35, Andrew Morton wrote:
> > > > > Con Kolivas <[email protected]> wrote:
> > > > > > + /*
> > > > > > + * get_page_state is super expensive so we only perform it
> > > > > > every + * SWAP_CLUSTER_MAX prefetched_pages.
> > > > >
> > > > > nr_running() is similarly expensive btw.
> > > >
> > > > Yes which is why I do it just as infrequently as get_page_state.
> > > >
> > > > > > * We also test if we're the only
> > > > > > + * task running anywhere. We want to have as little impact on
> > > > > > all + * resources (cpu, disk, bus etc). As this iterates over
> > > > > > every cpu + * we measure this infrequently.
> > > > > > + */
> > > > > > + if (!(sp_stat.prefetched_pages % SWAP_CLUSTER_MAX)) {
> > > > > > + unsigned long cpuload = nr_running();
> > > > > > +
> > > > > > + if (cpuload > 1)
> > > > > > + goto out;
> > > > >
> > > > > Sorry, this is just wrong. If swap prefetch is useful then it's
> > > > > also useful if some task happens to be sitting over in the corner
> > > > > calculating pi.
> > > > >
> > > > > What's the actual problem here? Someone's 3d game went blippy?
> > > > > Why? How much? Are we missing a cond_resched()?
> > > >
> > > > No, it's pretty easy to reproduce, kprefetchd sits there in
> > > > uninterruptible sleep with one cpu on SMP pegged at 100% iowait due
> > > > to it. This tends to have noticeable effects everywhere on HT or SMP.
> > > > On UP the yielding helped it but even then it still causes blips. How
> > > > much? Well to be honest it's noticeable a shipload. Running a game,
> > > > any game, that uses 100% (and most fancy games do) causes stuttering
> > > > on audio, pauses and so on. This is evident on linux native games,
> > > > games under emulators or qemu and so on. That iowait really hurts,
> > > > and tweaking just priority doesn't help it in any way.
> > >
> > > That doesn't really make sense to me. If a task can trigger audio
> > > dropout and stalls by sleeping, we have a serious problem. In your
> > > SMP/HT case, I'd start crawling over the load balancing code. I can't
> > > see how trivial CPU with non-saturated IO can cause dropout in the UP
> > > case either. Am I missing something?
> >
> > Clearly you, me and everyone else is missing something. I see it with
> > each task bound to one cpu with cpu affinity so it's not a balancing
> > issue. Try it yourself if you can instead of not believing me. Get a big
> > dd reader (virtually no cpu and all io wait sleep) on one cpu and try and
> > play a game on the other cpu. It dies rectally.
>
> I said it didn't make sense to me, not that I didn't believe you. If I
> had a real SMP box, I would look into it, but all I have is HT.
No doubt it would be better on an SMP box. However, these multi-core,
multi-threading cpus are now more common than real SMP, and they all
share varying amounts of their resources.
> If you're creating a lot of traffic, I can see it causing problems. I
> was under the impression that you were doing minimal IO and absolutely
> trivial CPU. That's what didn't make sense to me to be clear.
A lot of cpu would be easier to handle; it's using absolutely minuscule
amounts of cpu. The IO is massive though (and seeky in nature), and reading
from a swap partition seems particularly expensive in this regard.
Cheers,
Con
On Sat, 2006-03-11 at 18:20 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 17:05, Mike Galbraith wrote:
> > P.S. If it's hefty IO, it makes sense, and having the ability to do PIO
> > instead of DMA would probably help.
>
> That would probably be worse, because then it would use much more cpu in the
> form of kernel context time and not be attributed to kprefetchd at all.
> Anyway, this is clearly not a workaround (yes, I do know you weren't
> promoting it as such).
Substitute PIO with trickle mode IO, which we don't have AFAIK.
Point was, if it's hefty IO, the problem is likely DMA, so what you'd
need to do is prevent the IO from being consolidated into mondo blocks
of DMA==bus contention. Doing that via yield or whatever would be the
wrong approach to the problem.
-Mike
On Sat, 2006-03-11 at 18:24 +1100, Con Kolivas wrote:
> On Saturday 11 March 2006 17:00, Mike Galbraith wrote:
> > If you're creating a lot of traffic, I can see it causing problems. I
> > was under the impression that you were doing minimal IO and absolutely
> > trivial CPU. That's what didn't make sense to me, to be clear.
>
> A lot of cpu would be easier to handle; it's using absolutely minuscule
> amounts of cpu. The IO is massive though (and seeky in nature), and reading
> from a swap partition seems particularly expensive in this regard.
There used to be a pages in flight 'restrictor plate' in there that
would have probably helped this situation at least a little. But in any
case, it sounds like you'll have to find a way to submit the IO in itty
bitty synchronous pieces.
-Mike
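For illustration, a minimal sketch of what submitting the IO in itty bitty
synchronous pieces could look like. struct swapped_entry is assumed from the
swap prefetch list; prefetch_one_page() and next_swapped_entry() are
hypothetical stand-ins for the real swap_prefetch.c machinery, and the batch
size and sleep interval are invented tuning values, not anything from the
patch:

	#include <linux/delay.h>

	#define TRICKLE_BATCH		8	/* pages per burst (invented) */
	#define TRICKLE_SLEEP_MS	100	/* pause between bursts (invented) */

	/*
	 * Sketch only: prefetch swap pages in small bursts, sleeping
	 * between bursts so the requests are never consolidated into
	 * large blocks of DMA.
	 */
	static void trickle_prefetch(struct swapped_entry *entry)
	{
		int batch = 0;

		while (entry) {
			/* hypothetical helper: read one page synchronously */
			if (prefetch_one_page(entry) != TRICKLE_SUCCESS)
				break;
			entry = next_swapped_entry(entry);

			if (++batch == TRICKLE_BATCH) {
				batch = 0;
				msleep_interruptible(TRICKLE_SLEEP_MS);
			}
		}
	}

The sleep between bursts is the crude bandwidth limiter here; a real version
would want to scale it against device speed rather than hardcode it.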
Mike Galbraith wrote:
> On Sat, 2006-03-11 at 18:20 +1100, Con Kolivas wrote:
>>On Saturday 11 March 2006 17:05, Mike Galbraith wrote:
>>>On Sat, 2006-03-11 at 07:00 +0100, Mike Galbraith wrote:
>>>>On Sat, 2006-03-11 at 16:50 +1100, Con Kolivas wrote:
>>>>>On Saturday 11 March 2006 16:33, Mike Galbraith wrote:
>>>>>>On Sat, 2006-03-11 at 14:50 +1100, Con Kolivas wrote:
>>>>>>>On Saturday 11 March 2006 09:35, Andrew Morton wrote:
>>>>>>>>Con Kolivas <[email protected]> wrote:
So... you guys ever think about trimming this? Not only would
it be faster to read, you can save the list server about 15MB
worth of email a pop with just a small haircut.
--
SUSE Labs, Novell Inc.
On Saturday 11 March 2006 18:51, Mike Galbraith wrote:
> There used to be a pages in flight 'restrictor plate' in there that
> would have probably helped this situation at least a little. But in any
> case, it sounds like you'll have to find a way to submit the IO in itty
> bitty synchronous pieces.
Well, the original code used to have a heuristic to decide how much to
prefetch at a time. It was considered opaque so I removed it. It made the
amount to prefetch proportional to the amount of ram, which is wrong of
course, because it should depend more on swap partition read speed vs bus
bandwidth or something.
This way of deciding based on cpu load works anyway, but yet again it seems
unpopular.
Cheers,
Con
On Sat, 2006-03-11 at 19:16 +1100, Nick Piggin wrote:
> So... you guys ever think about trimming this? Not only would
> it be faster to read, you can save the list server about 15MB
> worth of email a pop with just a small haircut.
>
Sorry, was doing too many things at once to notice. I think we're about
done yacking anyway.
-Mike
On Sat, 2006-03-11 at 08:51 +0100, Mike Galbraith wrote:
> There used to be a pages in flight 'restrictor plate' in there that
> would have probably helped this situation at least a little. But in
> any case, it sounds like you'll have to find a way to submit the IO in
> itty bitty synchronous pieces.
echo 64 > /sys/block/hd*/queue/max_sectors_kb
There is basically a straight linear relation between whatever you set
this to and the maximum scheduling latency you see. It was developed to
solve the exact problem you are describing.
Lee
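As a rough worked example of that linear relation (assuming a disk that
streams about 50 MB/s, an invented figure): a single 512 KB request, the
usual default, occupies the device for roughly 10 ms once dispatched, while
a 64 KB request occupies it for about 1.3 ms, so a competing
latency-sensitive request waits correspondingly less behind it.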
On Sat, 2006-03-11 at 23:54 -0500, Lee Revell wrote:
> echo 64 > /sys/block/hd*/queue/max_sectors_kb
>
> There is basically a straight linear relation between whatever you set
> this to and the maximum scheduling latency you see. It was developed to
> solve the exact problem you are describing.
Ah, a very useful bit of information, thanks.
It won't help Con though, because he'll be dealing with every possible
configuration. I think he's going to have to either submit, wait, do a
bandwidth-limiting sleep and repeat, or do something clever to that effect.
Even with bandwidth restriction though, seek still bites mightily, so I
suspect he's stuck with little trickles of IO started when we'd
otherwise be idle. We'll see I suppose.
-Mike
On Sunday 12 March 2006 16:27, Mike Galbraith wrote:
> Ah, a very useful bit of information, thanks.
>
> It won't help Con though, because he'll be dealing with every possible
> configuration. I think he's going to have to either submit, wait, do a
> bandwidth-limiting sleep and repeat, or do something clever to that effect.
> Even with bandwidth restriction though, seek still bites mightily, so I
> suspect he's stuck with little trickles of IO started when we'd
> otherwise be idle. We'll see I suppose.
What I'm doing with that last patch works fine - don't prefetch if anything
else is running. Prefetching is not a performance critical function and we
cannot know what tasks are scheduling-latency sensitive. With that latest
patch the most expensive thing is doing nr_running(). Assuming anything is
running, it only needs to do that once every 5 seconds - and only after
something is in swap. Furthermore it doesn't do it if swap prefetch is
disabled with the tunable. I don't think this is an expensive operation in
that context, and it certainly avoids the problems described.
I could hack in a weighted-load variant of it so that prefetch still runs
when only nice 19 tasks are running, so that low priority compiles,
distributed computing clients et al don't prevent prefetching from
happening; I could do this on top of the current patch. I'd like to see
that last patch go in. Does anyone have another alternative?
Cheers,
Con
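For reference, a rough sketch of the gating Con describes. nr_running() and
SWAP_CLUSTER_MAX are real kernel symbols; swapped_pages_remain() and
prefetch_next_page() are hypothetical placeholders for the swap_prefetch.c
internals:

	#include <linux/sched.h>
	#include <linux/swap.h>

	/*
	 * Sketch only: kprefetchd works only while it is the sole
	 * runnable task in the system, backing off for 5 seconds
	 * whenever anything else is running.
	 */
	static void kprefetchd_cycle(void)
	{
		unsigned long prefetched = 0;

		while (swapped_pages_remain()) {
			/*
			 * nr_running() iterates over every cpu, so only
			 * test it every SWAP_CLUSTER_MAX pages, as the
			 * patch under discussion does.
			 */
			if (!(prefetched % SWAP_CLUSTER_MAX) &&
			    nr_running() > 1) {
				schedule_timeout_interruptible(5 * HZ);
				continue;
			}
			prefetch_next_page();
			prefetched++;
		}
	}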
On Sat, 2006-03-11 at 23:54 -0500, Lee Revell wrote:
> echo 64 > /sys/block/hd*/queue/max_sectors_kb
>
> There is basically a straight linear relation between whatever you set
> this to and the maximum scheduling latency you see. It was developed to
> solve the exact problem you are describing.
<head-scratching>
Is it possible that you mean pci latency? I'm unable to measure any
scheduling latency > 5ms while pushing IO for all my little Barracuda
disk is worth. I _can_ generate mp3 player audio dropout though,
despite mp3 files living on a separate drive/controller.
-Mike
On Tue, 2006-03-14 at 07:40 +0100, Mike Galbraith wrote:
> On Sat, 2006-03-11 at 23:54 -0500, Lee Revell wrote:
> >
> > echo 64 > /sys/block/hd*/queue/max_sectors_kb
> >
> > There is basically a straight linear relation between whatever you set
> > this to and the maximum scheduling latency you see. It was developed to
> > solve the exact problem you are describing.
>
> <head-scratching>
>
> Is it possible that you mean pci latency? I'm unable to measure any
> scheduling latency > 5ms while pushing IO for all my little Barracuda
> disk is worth.
It's only a big problem if LBA48 is in use, which allows 32MB of IO to be
in flight at once; this depends on the size of the drive.
What does that value default to?
> I _can_ generate mp3 player audio dropout though,
> despite mp3 files living on a separate drive/controller.
>
Does this go away if you run the mp3 player at nice -20?
> -Mike
>
>
On Tue, 2006-03-14 at 01:50 -0500, Lee Revell wrote:
> It's only a big problem if LBA48 is in use, which allows 32MB of IO to be
> in flight at once; this depends on the size of the drive.
This is a 120G drive.
>
> What does that value default to?
512.
> > I _can_ generate mp3 player audio dropout though,
> > despite mp3 files living on a separate drive/controller.
> >
>
> Does this go away if you run the mp3 player at nice -20?
Nope.
-Mike
On Tue, Mar 14 2006, Lee Revell wrote:
> It's only a big problem if LBA48 is in use, which allows 32MB of IO to be
> in flight at once; this depends on the size of the drive.
>
> What does that value default to?
Not quite true. Even if lba48 is active on the drive, we don't allow
more than 1MB per request. And nit-picking a little, lba48 doesn't
always depend on the size of the drive; some drives smaller than 2^28
sectors also feature lba48 support.
--
Jens Axboe
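For scale: 2^28 sectors of 512 bytes is 128 GiB (about 137 GB), so a 120G
drive like the one mentioned earlier sits under the lba28 limit yet, per
Jens, may still feature lba48 support.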
On Tue, 2006-03-14 at 08:06 +0100, Mike Galbraith wrote:
> On Tue, 2006-03-14 at 01:50 -0500, Lee Revell wrote:
> > Does this go away if you run the mp3 player at nice -20?
>
> Nope.
But it does go away if I change from amarok to xmms, so amarok is
probably just not buffering quite enough. OTOH, xmms seems to be picky
in other respects: during heavy disk IO it'll gripe about my soundcard
not being ready while switching songs; I retry by poking the play
button, and all is fine. Hohum.
Anyway, it seems I can't reproduce the really bad stuff here, so there's
nothing for me to tinker with.