Attached patch implements an O_STREAMING file I/O flag which enables
manual drop-behind of pages.
If the file has O_STREAMING set then the user has explicitly said "this
is streaming data, I know I will not revisit this, do not cache
anything". So we drop pages from the pagecache before our current
index. We have to fiddle a bit to get writes working since we do
write-behind but the logic is there and it works.
Some numbers. A simple streaming read to verify the pagecache effects:
Streaming 1GB Read (avg of many runs, mem=2GB):
O_STREAMING Wall time Change in Page Cache
Yes 25.58s 0
No 25.55s +835MB
Another read with much more VM pressure:
Streaming 1GB Read (avg of many runs, mem=8M)
O_STREAMING Wall time Change in Page Cache
Yes 25.76s 0
No 29.01s +1MB
And now the kicker:
Kernel compile (make -j2) and concurrent streaming I/O
(avg of two runs, mem=128M):
O_STREAMING Time to complete Kernel Compile
Yes 3m27.863s
No 4m15.818s
This is c/o Andrew Morton.
Patch is against 2.4.20-pre9. Why not 2.5? Because Andrew says we can
do better, perhaps with a real drop-behind heuristic. As 20 Oct looms
quite close, we shall see.
Robert Love
Implement O_STREAMING for streaming I/O for manual drop-behind of pages.
include/asm-arm/fcntl.h | 1
include/asm-i386/fcntl.h | 1
include/asm-mips/fcntl.h | 1
include/asm-ppc/fcntl.h | 1
include/asm-sh/fcntl.h | 1
mm/filemap.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 94 insertions(+)
diff -urN linux-2.4.20-pre9/include/asm-arm/fcntl.h linux/include/asm-arm/fcntl.h
--- linux-2.4.20-pre9/include/asm-arm/fcntl.h 2002-10-06 14:57:26.000000000 -0400
+++ linux/include/asm-arm/fcntl.h 2002-10-07 18:45:51.000000000 -0400
@@ -20,6 +20,7 @@
#define O_NOFOLLOW 0100000 /* don't follow links */
#define O_DIRECT 0200000 /* direct disk access hint - currently ignored */
#define O_LARGEFILE 0400000
+#define O_STREAMING 04000000 /* streaming access */
#define F_DUPFD 0 /* dup */
#define F_GETFD 1 /* get close_on_exec */
diff -urN linux-2.4.20-pre9/include/asm-i386/fcntl.h linux/include/asm-i386/fcntl.h
--- linux-2.4.20-pre9/include/asm-i386/fcntl.h 2002-10-06 14:57:21.000000000 -0400
+++ linux/include/asm-i386/fcntl.h 2002-10-07 18:45:51.000000000 -0400
@@ -20,6 +20,7 @@
#define O_LARGEFILE 0100000
#define O_DIRECTORY 0200000 /* must be a directory */
#define O_NOFOLLOW 0400000 /* don't follow links */
+#define O_STREAMING 04000000 /* streaming access */
#define F_DUPFD 0 /* dup */
#define F_GETFD 1 /* get close_on_exec */
diff -urN linux-2.4.20-pre9/include/asm-mips/fcntl.h linux/include/asm-mips/fcntl.h
--- linux-2.4.20-pre9/include/asm-mips/fcntl.h 2002-10-06 14:57:21.000000000 -0400
+++ linux/include/asm-mips/fcntl.h 2002-10-07 18:45:51.000000000 -0400
@@ -26,6 +26,7 @@
#define O_DIRECT 0x8000 /* direct disk access hint */
#define O_DIRECTORY 0x10000 /* must be a directory */
#define O_NOFOLLOW 0x20000 /* don't follow links */
+#define O_STREAMING 0x400000 /* streaming access */
#define O_NDELAY O_NONBLOCK
diff -urN linux-2.4.20-pre9/include/asm-ppc/fcntl.h linux/include/asm-ppc/fcntl.h
--- linux-2.4.20-pre9/include/asm-ppc/fcntl.h 2002-10-06 14:57:22.000000000 -0400
+++ linux/include/asm-ppc/fcntl.h 2002-10-07 18:45:51.000000000 -0400
@@ -23,6 +23,7 @@
#define O_NOFOLLOW 0100000 /* don't follow links */
#define O_LARGEFILE 0200000
#define O_DIRECT 0400000 /* direct disk access hint */
+#define O_STREAMING 04000000 /* streaming access */
#define F_DUPFD 0 /* dup */
#define F_GETFD 1 /* get close_on_exec */
diff -urN linux-2.4.20-pre9/include/asm-sh/fcntl.h linux/include/asm-sh/fcntl.h
--- linux-2.4.20-pre9/include/asm-sh/fcntl.h 2002-10-06 14:57:27.000000000 -0400
+++ linux/include/asm-sh/fcntl.h 2002-10-07 18:45:51.000000000 -0400
@@ -20,6 +20,7 @@
#define O_LARGEFILE 0100000
#define O_DIRECTORY 0200000 /* must be a directory */
#define O_NOFOLLOW 0400000 /* don't follow links */
+#define O_STREAMING 04000000 /* streaming access */
#define F_DUPFD 0 /* dup */
#define F_GETFD 1 /* get close_on_exec */
diff -urN linux-2.4.20-pre9/mm/filemap.c linux/mm/filemap.c
--- linux-2.4.20-pre9/mm/filemap.c 2002-10-06 14:57:20.000000000 -0400
+++ linux/mm/filemap.c 2002-10-07 18:45:51.000000000 -0400
@@ -1322,6 +1322,90 @@
SetPageReferenced(page);
}
+/**
+ * shrink_list - non-blockingly drop pages from the given cache list
+ * @mapping: the mapping from which we want to drop pages (unused here)
+ * @list: which list to scan (e.g. locked, dirty, clean)
+ * @max_index: greatest page index from which we will drop pages
+ *
+ * Scans @list from its tail with pagemap_lru_lock and pagecache_lock held.
+ * Dirty pages, pages beyond @max_index and pages whose page_count() is not 1
+ * are skipped; the scan stops at the first page TryLockPage() cannot lock or
+ * whose buffers cannot be released.  Returns the number of pages dropped.
+ */
+static unsigned long shrink_list(struct address_space *mapping,
+				 struct list_head *list,
+				 unsigned long max_index)
+{
+	struct list_head *curr;
+	unsigned long nr_shrunk = 0;
+
+	spin_lock(&pagemap_lru_lock);
+	spin_lock(&pagecache_lock);
+
+	/* read the tail only once locked: the list can change before that */
+	curr = list->prev;
+	while (curr != list) {
+		struct page *page = list_entry(curr, struct page, list);
+
+		curr = curr->prev;
+		if (page->index > max_index || PageDirty(page))
+			continue;
+		if (TryLockPage(page))
+			break;
+		if (page->buffers && !try_to_release_page(page, 0)) {
+			/* probably dirty buffers */
+			unlock_page(page);
+			break;
+		}
+		if (page_count(page) != 1) {
+			unlock_page(page);
+			continue;
+		}
+
+		__lru_cache_del(page);
+		__remove_inode_page(page);
+		unlock_page(page);
+		page_cache_release(page);
+		nr_shrunk++;
+	}
+
+	spin_unlock(&pagecache_lock);
+	spin_unlock(&pagemap_lru_lock);
+
+	return nr_shrunk;
+}
+
+/**
+ * shrink_pagecache - non-blockingly drop pages from the file's mapping
+ * @file: the file we are doing I/O on
+ * @max_index: the highest page index we are willing to drop
+ *
+ * Backs O_STREAMING, whose meaning is "I am streaming data, I know I will
+ * not revisit this; do not cache anything".  Mappings holding fewer than
+ * 256 pages are left untouched and 0 is returned.  Otherwise the locked,
+ * clean and dirty lists are handed to shrink_list(), in that order, and
+ * the total number of pages dropped is returned.
+ *
+ * Pages past @max_index are kept so that readahead is not thrown away.
+ */
+static unsigned long shrink_pagecache(struct file *file,
+				      unsigned long max_index)
+{
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	unsigned long nr_dropped;
+
+	/* not enough cached pages to be worth a scan */
+	if (mapping->nrpages < 256)
+		return 0;
+
+	nr_dropped = shrink_list(mapping, &mapping->locked_pages, max_index);
+	nr_dropped += shrink_list(mapping, &mapping->clean_pages, max_index);
+	nr_dropped += shrink_list(mapping, &mapping->dirty_pages, max_index);
+
+	return nr_dropped;
+}
+
/*
* This is a generic file read routine, and uses the
* inode->i_op->readpage() function for the actual low-level
@@ -1538,6 +1622,8 @@
filp->f_reada = 1;
if (cached_page)
page_cache_release(cached_page);
+ if (filp->f_flags & O_STREAMING)
+ shrink_pagecache(filp, index);
UPDATE_ATIME(inode);
}
@@ -3047,6 +3133,9 @@
if (file->f_flags & O_DIRECT)
goto o_direct;
+ if (file->f_flags & O_STREAMING)
+ shrink_pagecache(file, pos >> PAGE_CACHE_SHIFT);
+
do {
unsigned long index, offset;
long page_fault;
Hi,
On 2002.10.08 Robert Love wrote:
>Attached patch implements an O_STREAMING file I/O flag which enables
>manual drop-behind of pages.
>
>If the file has O_STREAMING set then the user has explicitly said "this
>is streaming data, I know I will not revisit this, do not cache
>anything". So we drop pages from the pagecache before our current
>index. We have to fiddle a bit to get writes working since we do
>write-behind but the logic is there and it works.
>
Sorry if this is a newbie question, but does glibc pass the flags blindly
to the syscall? I.e., I do not need to rebuild glibc to use this in
open(), fcntl() and so on; I can just make sure that bit 04000000
is set in the flags.
TIA
--
J.A. Magallon <[email protected]> \ Software is like sex:
werewolf.able.es \ It's better when it's free
Mandrake Linux release 9.0 (dolphin) for i586
Linux 2.4.20-pre9-jam1 (gcc 3.2 (Mandrake Linux 9.0 3.2-1mdk))
On Tue, 2002-10-08 at 06:42, J.A. Magallon wrote:
> Sorry if this is a newbie question, but, does glibc pass flags blindly
> to the syscal ?? Ie, I do not need to rebuild glibc to use this in
> open(), fcntl() and so on, just I can make sure that bit 04000000
> is set in the flags.
Right. Do something like:
#define O_STREAMING 04000000
fd = open(file, ... | O_STREAMING);
or open it via fopen() and use fcntl() to set O_STREAMING.
Robert Love
On Mon, Oct 07, 2002 at 10:38:55PM -0400, Robert Love wrote:
> Attached patch implements an O_STREAMING file I/O flag which enables
> manual drop-behind of pages.
When/why would you use this instead of O_DIRECT?
--cw
On Tue, 2002-10-08 at 14:38, Chris Wedgwood wrote:
> > Attached patch implements an O_STREAMING file I/O flag which enables
> > manual drop-behind of pages.
I answered this in a previous email to this list:
In a lot of ways. This flag changes no semantics except to not let
pages from the mapping populate the page cache for very long.
In other words, this flag pretty much disables the pagecache for
this mapping, although we happily keep it around for write-behind
and read-ahead. But once the data is behind us and safe to kill, we
do. It is manual drop-behind.
O_DIRECT has a lot of semantics, one of which is to attempt to
minimize cache effects. It is also synchronous, requires properly
aligned buffers, and pretty much minimizes interaction with as much
of the kernel as possible. I am not overly familiar with its uses,
but I always assumed the big user is applications that implement
their own caching layer.
O_STREAMING would be for your TiVo or network audio streamer. Any
file I/O that is inherently sequential and access-once. No point
trashing the pagecache with its data - but otherwise the behavior is
normal.
Basically, with O_STREAMING you want normal semantics except drop-behind
of the pages. You even still want the pagecache caching your data -
just the not-yet-written write-behind data and the not-yet-read
read-ahead data.
With O_DIRECT you get a whole different can-of-worms. Basically you cut
out a lot of the kernel. You can do normal libc file I/O on an
O_STREAMING file with no semantic changes; except the drop-behind of the
pages.
Robert Love
Playing the devil's advocate here... I didn't see this earlier (when
was it discussed, I can't see it looking back either), so sorry if
this sounds circular or I'm going over stuff that has been discussed
before... but...
On Tue, Oct 08, 2002 at 02:49:09PM -0400, Robert Love wrote:
> In other words, this flag pretty much disables the pagecache for
> this mapping, although we happily keep it around for write-behind
> and read-ahead. But once the data is behind us and safe to kill, we
> do. It is manual drop-behind.
OK. What might use this though? What applications might want to
disable the page-cache but still use write-behind?
> O_DIRECT has a lot of semantics, one of which is to attempt to
> minimize cache effects.
It depends on the OS. Some OS are broken and treat O_DIRECT as a
hint, Linux and IRIX know it's a *requirement*.
> O_STREAMING would be for your TiVo or network audio streamer. Any
> file I/O that is inherently sequential and access-once. No point
> trashing the pagecache with its data - but otherwise the behavior is
> normal.
Actually, this sounds perfect for O_DIRECT. But I don't know much
about streaming video.
Since you only want the data once, why use the page-cache at all and
needlessly copy? Certainly, the requirements for O_DIRECT are not
that hard to meet or implement.
Don't get me wrong, I'm not saying this is a bad thing at all. The
patch is small and elegant so it's hard to object; I'm just trying to
understand where in practice I would use this over O_DIRECT.
--cw
Robert Love wrote:
>
> ...
>
> Andrew, any experience on one vs. the other?
I'd say that if you were designing a new application which
streams large amount of data then yes, you would design it
to use O_DIRECT. You would instantiate a separate IO worker
thread and a message passing mechanism so that thread would
pump your data for you, and would perform your readahead, etc.
If your filesystem supports O_DIRECT, of course. Not all do.
The strength of O_STREAMING is that you can take an existing,
working, megahuge application and make it play better with the
VM by changing a single line of code. No big redesign needed.
On Tue, Oct 08, 2002 at 12:05:13PM -0700, Chris Wedgwood wrote:
> Playing the devil's advocate here... I didn't see this earlier (when
> was it discussed, I can't see it looking back either), so sorry if
> this sounds circular or I'm going over stuff that has been discussed
> before... but...
>
>
> On Tue, Oct 08, 2002 at 02:49:09PM -0400, Robert Love wrote:
>
> > In other words, this flag pretty much disables the pagecache for
> > this mapping, although we happily keep it around for write-behind
> > and read-ahead. But once the data is behind us and safe to kill, we
> > do. It is manual drop-behind.
>
> OK. What might use this though? What applications might want to
> disable the page-cache but still use write-behind?
mkisofs?
Or do you have a machine with 5-6 GB of RAM to cache the content of a
DVD-image?
I only have 3 GB of RAM, and creating and writing trashes the whole
cache twice.
Bis denn
--
Real Programmers consider "what you see is what you get" to be just as
bad a concept in Text Editors as it is in women. No, the Real Programmer
wants a "you asked for it, you got it" text editor -- complicated,
cryptic, powerful, unforgiving, dangerous.
On Tue, Oct 08, 2002 at 03:17:16PM -0400, Robert Love wrote:
> Yep. Linux treats most "hints" (e.g. madvise) as a requirement - it
> fails if it cannot do it. That is against the spec most of the
> time, but oh well...
There is no spec for O_DIRECT... SGI 'invented' this in '93 or perhaps
earlier (but the idea wasn't new) for IRIX.
O_DIRECT is a very special thing; you shouldn't ask for this unless
you know you want it and how to deal with it --- treating it as
anything less than a requirement is bogus IMO.
> Shrug. I do not have much experience with O_DIRECT. I suspect the
> synchronous nature and the requirement of aligned buffers is not
> ideal.
I'm not sure how being synchronous matters if you use a different
thread (perhaps it's a pain), this also allows your own user-space
code to implement read-ahead? Buffer alignment issues in practice
really aren't that bad.
If someone can think of a meaningful benchmark that would be cool; if
not, then I'll hack up the code I wrote to stream DVD vobs about and
see how that compares.
--cw
On Tue, 2002-10-08 at 15:52, Chris Wedgwood wrote:
> On Tue, Oct 08, 2002 at 03:17:16PM -0400, Robert Love wrote:
>
> > Yep. Linux treats most "hints" (e.g. madvise) as a requirement - it
> > fails if it cannot do it. That is against the spec most of the
> > time, but oh well...
>
> There is no spec for O_DIRECT... SGI 'invented' this in '93 or perhaps
> earlier (but the idea wasn't new) for IRIX.
I was speaking more of madvise() and in general. I know O_DIRECT does
not have a spec. In general, Linux returns failures on things that many
other operating systems just consider hints (i.e. madvise()).
> O_DIRECT is a very special thing, you shouldn't ask for this unless
> yoy know you want it and how to deal with it --- treating it as
> anything less that a requirement is bogus IMO.
Agreed. Partly why O_STREAMING is needed.
Remember not everything implements O_DIRECT, especially not some odd
device you are streaming into/out of.
I think Andrew summed it up: if O_DIRECT will work in your environment,
and you can rewrite your application, it is probably preferred.
O_STREAMING is a simple solution to solve the pagecache waste which
requires one change to the application.
I did not intend for this to be an O_DIRECT vs. O_STREAMING thread. A
lot of people agree we need something like O_STREAMING - despite never
being implemented, you can find its name referenced often in archives
via google. A much more interesting argument is whether we should not
have an explicit O_STREAMING but instead an intelligent drop-behind
heuristic... but that is a 2.5 issue and the patch is for 2.4.
A roughly 20% faster kernel compile (4m15s down to 3m27s) is amazingly nice, for free.
Robert Love
On Tue, Oct 08, 2002 at 09:53:32PM +0200, Matthias Schniedermeyer wrote:
mkisofs?
O_DIRECT would probably win here too I think.
> I only have 3 GB of RAM, and creating and writing trashes the whole
> cache twice.
With 512MB of RAM, I stream (in the background while I'm poking about
under X with Mozilla and things) 10GB+ files around all the time and
never notice it, this is using O_DIRECT off XFS.
--cw
Matthias Schniedermeyer wrote:
>
> ...
> I only have 3 GB of RAM, and creating and writing trashes the whole
> cache twice.
That's actually something completely dumb and irritating which
Linux has done for ever ;)
What we need is to detect the situation where someone is linearly
walking through a file which is preposterously too large to cache,
and just start dropping it.
It's not hard to implement the lower machinery to do that - it would
basically be an internal call to posix_fadvise(), which we don't
have but could and perhaps should...
The tricky part is designing the algorithm which decides when to
pull the trigger.
On Tue, Oct 08, 2002 at 03:59:50PM -0400, Robert Love wrote:
> 20% increase in kernel compilation is amazingly nice, for free.
Results like this make pretty good arguments :)
--cw
On Tue, 2002-10-08 at 15:05, Chris Wedgwood wrote:
> > In other words, this flag pretty much disables the pagecache for
> > this mapping, although we happily keep it around for write-behind
> > and read-ahead. But once the data is behind us and safe to kill, we
> > do. It is manual drop-behind.
>
> OK. What might use this though? What applications might want to
> disable the page-cache but still use write-behind?
Streaming I/O wants read-ahead. Filesystems themselves implement the
write-behind and we do not want to circumvent so much of the kernel.
The point of O_STREAMING is one change: drop pages in the pagecache
behind our current position, that are free-able, because we know we will
never want them. It's a hint from the application saying "I will never
revisit this so dump it".
O_DIRECT is a much bigger can of worms. You lose a lot of what the
kernel provides. You have to do things in block-sized chunks. Etc.
etc.
> > O_DIRECT has a lot of semantics, one of which is to attempt to
> > minimize cache effects.
>
> It depends on the OS. Some OS are broken and treat O_DIRECT as a
> hint, Linux and IRIX know it's a *requirement*.
Yep. Linux treats most "hints" (e.g. madvise) as a requirement - it
fails if it cannot do it. That is against the spec most of the time,
but oh well...
> > O_STREAMING would be for your TiVo or network audio streamer. Any
> > file I/O that is inherently sequential and access-once. No point
> > trashing the pagecache with its data - but otherwise the behavior is
> > normal.
>
> Actually, this sounds perfect for O_DIRECT. But I don't know much
> about streaming video.
>
> Since you only want the data once, why use the page-cache at all and
> needlessly copy? Certainly, the requirements for O_DIRECT are not
> that hard to meet or implement.
>
> Don't get me wrong, I'm not saying this is a bad thing at all. The
> patch is small and elegant so it's hard to object; I'm just trying to
> understand where in practice I would use this over O_DIRECT.
Shrug. I do not have much experience with O_DIRECT. I suspect the
synchronous nature and the requirement of aligned buffers is not ideal.
With O_STREAMING you can simply set the flag and use your normal I/O and
normal interfaces and have a field day.
Andrew, any experience on one vs. the other?
Robert Love
On Tue, Oct 08, 2002 at 01:03:11PM -0700, Andrew Morton wrote:
> Matthias Schniedermeyer wrote:
> >
> > ...
> > I only have 3 GB of RAM, and creating and writing trashes the whole
> > cache twice.
>
> That's actually something completely dumb and irritating which
> Linux has done for ever ;)
>
> What we need is to detect the situation where someone is linearly
> walking through a file which is preposterously too large to cache,
> and just start dropping it.
>
> It's not hard to implement the lower machinery to do that - it would
> basically be an internal call to posix_fadvise(), which we don't
> have but could and perhaps should...
>
> The tricky part is designing the algorithm which decides when to
> pull the trigger.
I have more cases like this.
I use a program called VDR. It is for recording digital TV programs
from satellite.
After a recording is finished I cut the recordings. In my case I "stream"
the input data via NFS from the recording machine(s) through a converter
into the local temporary directory. After I have enough files I create
ISO images of the files. When I create an ISO image I "stream" the
files from HDD1 to HDD2 because otherwise it would completely kill the
performance. Then I burn the ISO image onto a DVD-R.
Every single part in the whole process trashes the cache.
Bis denn
--
Real Programmers consider "what you see is what you get" to be just as
bad a concept in Text Editors as it is in women. No, the Real Programmer
wants a "you asked for it, you got it" text editor -- complicated,
cryptic, powerful, unforgiving, dangerous.
> What we need is to detect the situation where someone is linearly
> walking through a file which is preposterously too large to cache,
> and just start dropping it.
...
> The tricky part is designing the algorithm which decides when to
> pull the trigger.
I did a variation of this in SunOS years ago. It did not have the
"big file" wrinkle you are suggesting, it worked like
if (we are sequential AND
we are running low on memory AND
file size > 256K) {
invalidate the pages behind me
}
You use the same data structures which turn on read ahead to mean
sequential access, that's obvious.
What this didn't fix was when you read a monster file into memory and
then didn't do anything with it. That would fill the page cache and
the above alg would keep you from thrashing the machine but didn't
flush the stale memory.
If I were to do it again, I'd maintain stats in the inode about
access pattern, # of pages in ram for the inode, time of last I/O,
time of last page fault or read. Then when memory is getting tight
you do a
foreach i (ALL INODES) {
unless (i.accesspat == SEQ) continue;
unless ((now() - i.pagefault) > STALE_TIME) continue;
unless ((now() - i.io) > STALE_TIME) continue;
flush_pages();
}
You want to be a lot more clever than that because you'd like to have
fudging in favor of the clean pages vs dirty pages; you more or less
end up wanting to go through the loop more than once, getting more and
more eager as you get more and more desperate for RAM.
--
---
Larry McVoy lm at bitmover.com http://www.bitmover.com/lm
Matthias Schniedermeyer wrote:
>
> ...
> I use a program called VDR. This is for recording digital-TV-program
> from satallite.
>
> After a recording is finished i cut the recordings. I my case i "stream"
> the input-data via NFS from the recording machine(s) through a converter
> into the local temporary directory. After i have enough files i create
> ISO-images of the files. When i create an ISO-images i "stream" the
> files from HDD1 to HDD2 because otherwise it would completly kill the
> performance. Then i burn the ISO-Image onto a DVD-R.
>
> Every single part in the whole process trashes the cache.
Right. You don't have O_DIRECT for NFS and you control the
application. You need O_STREAMING. Or posix_fadvise(), which
would be significantly harder to use and is not really implementable
in 2.4.
Any magical kernel voodoo which reads your mind and drops that
cache early would probably help, but there's no way in which it
can be as effective as an explicit hint.
> The point of O_STREAMING is one change: drop pages in the pagecache
> behind our current position, that are free-able, because we know we will
> never want them.
Does it drop pages unconditionally ? What happens if I do a
streaming_cat largedatabase > /dev/null while other processes
are working on it ? It's not a good thing to remove the whole
cached data other apps are working on.
Bye.
Giuliano Pochini wrote:
>
> > The point of O_STREAMING is one change: drop pages in the pagecache
> > behind our current position, that are free-able, because we know we will
> > never want them.
>
> Does it drop pages unconditionally ?
Yup.
> What happens if I do a
> streaming_cat largedatabase > /dev/null while other processes
> are working on it ?
You'll make your database run really slowly.
> It's not a good thing to remove the whole
> cached data other apps are working on.
>
Don't do that then ;)
Seriously, there are tons of ways of creating local performance
DoS'es of this form. fsync is an excellent tool for that.
On 09-Oct-2002 Andrew Morton wrote:
> Giuliano Pochini wrote:
>>
>> > The point of O_STREAMING is one change: drop pages in the pagecache
>> > behind our current position, that are free-able, because we know we will
>> > never want them.
>>
>> Does it drop pages unconditionally ?
>
> Yup.
>
>> What happens if I do a
>> streaming_cat largedatabase > /dev/null while other processes
>> are working on it ?
>
> You'll make your database run really slowly.
>
>> It's not a good thing to remove the whole
>> cached data other apps are working on.
>
> Don't do that then ;)
I was thinking about hot backups of databases. But even if it
did not drop caches "shared" by other processes it would drop
them anyway because write-behind is still on. Probably only
O_DIRECT can help in this case.
> Seriously, there are tons of ways of creating local performance
> DoS'es of this form. fsync is an excellent tool for that.
Yes, I'm aware of that.
Bye.
On Tuesday 08 October 2002 04:38, Robert Love wrote:
> Attached patch implements an O_STREAMING file I/O flag which enables
> manual drop-behind of pages.
>
> If the file has O_STREAMING set then the user has explicitly said "this
> is streaming data, I know I will not revisit this, do not cache
> anything". So we drop pages from the pagecache before our current
> index. We have to fiddle a bit to get writes working since we do
> write-behind but the logic is there and it works.
Great ;-)
This is the nice way of doing what the akpm-patch did for me a while ago.
roy
--
Roy Sigurd Karlsbakk, Datavaktmester
ProntoTV AS - http://www.pronto.tv/
Tel: +47 9801 3356
Computers are like air conditioners.
They stop working when you open Windows.
On 7 Oct 2002, Robert Love wrote:
> Attached patch implements an O_STREAMING file I/O flag which enables
> manual drop-behind of pages.
[...]
> diff -urN linux-2.4.20-pre9/include/asm-i386/fcntl.h linux/include/asm-i386/fcntl.h
> --- linux-2.4.20-pre9/include/asm-i386/fcntl.h 2002-10-06 14:57:21.000000000 -0400
> +++ linux/include/asm-i386/fcntl.h 2002-10-07 18:45:51.000000000 -0400
> @@ -20,6 +20,7 @@
> #define O_LARGEFILE 0100000
> #define O_DIRECTORY 0200000 /* must be a directory */
> #define O_NOFOLLOW 0400000 /* don't follow links */
> +#define O_STREAMING 04000000 /* streaming access */
^^^^^^^^^^^^^
>
> #define F_DUPFD 0 /* dup */
> #define F_GETFD 1 /* get close_on_exec */
> diff -urN linux-2.4.20-pre9/include/asm-mips/fcntl.h linux/include/asm-mips/fcntl.h
> --- linux-2.4.20-pre9/include/asm-mips/fcntl.h 2002-10-06 14:57:21.000000000 -0400
> +++ linux/include/asm-mips/fcntl.h 2002-10-07 18:45:51.000000000 -0400
> @@ -26,6 +26,7 @@
> #define O_DIRECT 0x8000 /* direct disk access hint */
> #define O_DIRECTORY 0x10000 /* must be a directory */
> #define O_NOFOLLOW 0x20000 /* don't follow links */
> +#define O_STREAMING 0x400000 /* streaming access */
^^^^^^^^^^^^^
>
> #define O_NDELAY O_NONBLOCK
04000000 != 0x400000
or am I missing something?
(do different archs dream of different O_STREAMING values?)
.TM.
--
____/ ____/ /
/ / / Marco Colombo
___/ ___ / / Technical Manager
/ / / ESI s.r.l.
_____/ _____/ _/ [email protected]
On Wed, 2002-10-09 at 10:10, Marco Colombo wrote:
> > #define O_NOFOLLOW 0400000 /* don't follow links */
> > #define O_NOFOLLOW 0x20000 /* don't follow links */
> ...
> 04000000 != 0x400000
>
> or am I missing something?
No need. See for example O_NOFOLLOW right above. Each architecture can
do as it pleases (I wish otherwise, but...).
> (do different archs dream of different O_STREAMING values?)
If they so choose. Just look at the formats of the two numbers you
posted, even those are different.
Robert Love
On Tue, 8 Oct 2002, Andrew Morton wrote:
> I'd say that if you were designing a new application which
> streams large amount of data then yes, you would design it
> to use O_DIRECT. You would instantiate a separate IO worker
> thread and a message passing mechanism so that thread would
> pump your data for you, and would peform your readahead, etc.
>
> If your filesystem supports O_DIRECT, of course. Not all do.
>
> The strength of O_STREAMING is that you can take an existing,
> working, megahuge application and make it play better with the
> VM by changing a single line of code. No big redesign needed.
Such as perl:
sysopen(MYKERNEL, "/boot/vmlinuz", 04000000);
O_DIRECT support is another beast, IMHO.
.TM.
On 9 Oct 2002, Robert Love wrote:
> On Wed, 2002-10-09 at 10:10, Marco Colombo wrote:
>
> > > #define O_NOFOLLOW 0400000 /* don't follow links */
> > > #define O_NOFOLLOW 0x20000 /* don't follow links */
Hmm. It's been a long time since I had to use octal. Since 0400000 is
the exact same value as 0x20000, why not use 0x20000? It's a much
more common notation.
Cheers,
Dick Johnson
Penguin : Linux version 2.4.18 on an i686 machine (797.90 BogoMips).
The US military has given us many words, FUBAR, SNAFU, now ENRON.
Yes, top management were graduates of West Point and Annapolis.
On Oct 09, 2002 10:14 -0400, Robert Love wrote:
> On Wed, 2002-10-09 at 10:10, Marco Colombo wrote:
>
> > > #define O_NOFOLLOW 0400000 /* don't follow links */
> > > #define O_NOFOLLOW 0x20000 /* don't follow links */
> > ...
> > 04000000 != 0x400000
> >
> > or am I missing something?
>
> No need. See for example O_NOFOLLOW right above. Each architecture can
> do has it pleases (I wish otherwise, but...).
>
> > (do different archs dream of different O_STREAMING values?)
>
> If they so choose. Just look at the formats of the two numbers you
> posted, even those are different.
I would say - if you are picking a new flag that doesn't need to have
compatibility with any platform-specific existing flag, simply set them
all high enough so that they are the same on all platforms. Just
because some of the flags are broken is no need to make all of them so.
Cheers, Andreas
--
Andreas Dilger
http://www-mddsp.enel.ucalgary.ca/People/adilger/
http://sourceforge.net/projects/ext2resize/
>
> Robert Love wrote:
> >
> > ...
> >
> > Andrew, any experience on one vs. the other?
>
> I'd say that if you were designing a new application which
> streams large amount of data then yes, you would design it
> to use O_DIRECT. You would instantiate a separate IO worker
> thread and a message passing mechanism so that thread would
> pump your data for you, and would peform your readahead, etc.
>
> If your filesystem supports O_DIRECT, of course. Not all do.
>
> The strength of O_STREAMING is that you can take an existing,
> working, megahuge application and make it play better with the
> VM by changing a single line of code. No big redesign needed.
I think it will be a really big win on quite low speed machines streaming
at eg set top box speeds, without much memory (eg your Tivo).
O_DIRECT needs at least 4M reads to overcome lack of readahead,
so is only really useful if you are doing really big stuff (eg
uncompressed video). It is possible that O_STREAMING will be just as
good as this however, because there are latency issues with O_DIRECT
(you really need aio or multiple threads to sustain good readahead,
one test of mine needed 8 threads for optimal performance). I have
a bunch of things I could retest with O_STREAMING and large readahead.
O_DIRECT is always going to win for random access stuff though. It is
good to have a choice.
Justin
On Wed, Oct 09, 2002 at 10:33:25AM +0200, Giuliano Pochini wrote:
> > The point of O_STREAMING is one change: drop pages in the pagecache
> > behind our current position, that are free-able, because we know we will
> > never want them.
> Does it drop pages unconditionally ? What happens if I do a
> streaming_cat largedatabase > /dev/null while other processes
> are working on it ? It's not a good thing to remove the whole
> cached data other apps are working on.
Anybody could make the cache thrash. I don't see this as an argument against
O_STREAMING (whether explicitly activated, or dynamically activated).
The only extension I would suggest (I don't think the patch did this?)
is that pages should only be candidates for being forgotten if all
open files associated with the page are O_STREAMING and all seek
points for all open files are beyond the page.
This would allow for a web app, or similar, that was serving the same
document over two different sockets, to provide a compromise between
O_STREAMING and not O_STREAMING where performance would suffer, but
for the common case, where only one person is accessing the file, the
full benefit of O_STREAMING would be realized.
Does the patch allow for mmap() to benefit from O_STREAMING?
"I intend to access this virtual memory range sequentially..."
mark
--
[email protected]/[email protected]/[email protected] __________________________
. . _ ._ . . .__ . . ._. .__ . . . .__ | Neighbourhood Coder
|\/| |_| |_| |/ |_ |\/| | |_ | |/ |_ |
| | | | | \ | \ |__ . | | .|. |__ |__ | \ |__ | Ottawa, Ontario, Canada
One ring to rule them all, one ring to find them, one ring to bring them all
and in the darkness bind them...
http://mark.mielke.cc/
> > Does it drop pages unconditionally ? What happens if I do a
> > streaming_cat largedatabase > /dev/null while other processes
> > are working on it ? It's not a good thing to remove the whole
> > cached data other apps are working on.
>
> Anybody could make the cache thrash. I don't see this as an argument against
> O_STREAMING (whether explicitly activated, or dynamically activated).
In fact it isn't. But I don't understand why we unconditionally discard a
page after it has been read. Yes, I told the kernel I will not need it
anymore, but someone else could need it. I'm not a kernel hacker and I
don't know if this is possible: when a page is read from disk by an O_STR
file, flag it "kill me first when needed, otherwise leave me in memory",
and if a page is already cached, just use it and change nothing. This
will preserve data used by other processes, and the data I've just
read if there is room. Free memory is wasted memory. Don't drop caches
if nobody needs memory.
Bye.
On Wed, Oct 09, 2002 at 09:36:11PM +0200, Giuliano Pochini wrote:
> In fact it isn't. But I don't undestand why we unconditionally discard a
> page after it has been read. Yes, I told the kernel I will not need it
> anymore, but someone else could need it. I'm not a kernel hacker and I
> don't know if this is possible: when a page is read from disk by a O_STR
> file flag it "kill me first when needed, otherwise leave me in memory",
> and if a page is already cache, just use it and change nothing. This
> will preserve data used by other processes, and the data I've just
> read if there is room. Free memory is wasted momory. Don't drop caches
> if nobody need memory.
If the patch were to be modified to include the following, you would not
have an issue with it:
1) Pages should not be candidates for dropping if an open file has
a seek offset pointing to an earlier page unless the seek offset
is many pages away. (I'm not sure what the best way of defining
'many' is... free memory? disk access times? experimentation?)
2) Pages should not be candidates for dropping if the pages belong
to the first few pages of a file. (First = 2? 4? 8?) The theory
being, that somebody could begin reading the file again from the
beginning.
With this in mind, dynamic detection becomes a lot easier and less
error prone. The simplest way of detecting a file that would benefit
from having pages dropped is to keep a flag that indicates whether the
file was *ever* read non-sequentially. If a file was never read
non-sequentially, pages that are not at the beginning of the file, and
pages that are earlier than all seek points for the file, or that are
many pages later than all earlier seek points for the file may be
safely dropped. In fact, I am surprised this is not implemented
already. :-)
mark
--
[email protected]/[email protected]/[email protected] __________________________
. . _ ._ . . .__ . . ._. .__ . . . .__ | Neighbourhood Coder
|\/| |_| |_| |/ |_ |\/| | |_ | |/ |_ |
| | | | | \ | \ |__ . | | .|. |__ |__ | \ |__ | Ottawa, Ontario, Canada
One ring to rule them all, one ring to find them, one ring to bring them all
and in the darkness bind them...
http://mark.mielke.cc/
Mark Mielke wrote:
> 2) Pages should not be candidates for dropping if the pages belong
> to the first few pages of a file. (First = 2? 4? 8?) The theory
> being, that somebody could begin reading the file again from the
> beginning.
This breaks the benefit of using O_STREAMING to read a lot of small
files once, as you might do when grepping the kernel tree for example.
-- Jamie
Andreas Dilger wrote:
> I would say - if you are picking a new flag that doesn't need to have
> compatibility with any platform-specific existing flag, simply set them
> all high enough so that they are the same on all platforms. Just
> because some of the flags are broken is no need to make all of them so.
Agreed! It would have been nice to do this earlier as there are a few
flags in this category. Oh well.
-- Jamie
On Wed, 9 Oct 2002, Andreas Dilger wrote:
> On Oct 09, 2002 10:14 -0400, Robert Love wrote:
> > On Wed, 2002-10-09 at 10:10, Marco Colombo wrote:
> >
> > > > #define O_NOFOLLOW 0400000 /* don't follow links */
> > > > #define O_NOFOLLOW 0x20000 /* don't follow links */
> >
> > No need. See for example O_NOFOLLOW right above. Each architecture can
> > do as it pleases (I wish otherwise, but...).
>
> I would say - if you are picking a new flag that doesn't need to have
> compatibility with any platform-specific existing flag, simply set them
> all high enough so that they are the same on all platforms.
Doesn't really matter, you can't run x86 binaries on MIPS so
you need to recompile anyway.
Source level compatibility is enough for flags like this.
regards,
Rik
--
Bravely reimplemented by the knights who say "NIH".
http://www.surriel.com/ http://distro.conectiva.com/
Current spamtrap: <a href="mailto:[email protected]">[email protected]</a>
Rik van Riel wrote:
> > I would say - if you are picking a new flag that doesn't need to have
> > compatibility with any platform-specific existing flag, simply set them
> > all high enough so that they are the same on all platforms.
>
> Doesn't really matter, you can't run x86 binaries on MIPS so
> you need to recompile anyway.
>
> Source level compatibility is enough for flags like this.
Using the _same_ flag on different architectures can simplify the
kernel source, though. Just imagine, a set of O_* definitions in
<linux/fcntl.h> instead of them being duplicated, with different
definitions, throughout <asm-*/fcntl.h>.
-- Jamie
On Thu Oct 10, 2002 at 01:16:41AM +0100, Jamie Lokier wrote:
> Using the _same_ flag on different architectures can simplify the
> kernel source, though. Just imagine, a set of O_* definitions in
> <linux/fcntl.h> instead of them being duplicated, with different
> definitions, throughout <asm-*/fcntl.h>.
That would be wonderful -- except those asm-*/fcntl.h values are
also duplicated in arch specific include/bits/fcntl.h files in
glibc, uClibc, etc and are compiled into zillions of existing
binaries. Change it and you break binary compatibility...
So if you're going to have a flag day, you will need to coordinate
that change with a bunch of non-kernel people as well.
-Erik
--
Erik B. Andersen http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--
On Thu, Oct 10, 2002 at 12:20:02AM +0100, Jamie Lokier wrote:
> Mark Mielke wrote:
> > 2) Pages should not be candidates for dropping if the pages belong
> > to the first few pages of a file. (First = 2? 4? 8?) The theory
> > being, that somebody could begin reading the file again from the
> > beginning.
> This breaks the benefit of using O_STREAMING to read a lot of small
> files once, as you might do when grepping the kernel tree for example.
It doesn't break it. It reduces it to current speeds.
I might be wrong, but it seems to me that O_STREAMING isn't the answer
to everything. The primary benefactors of O_STREAMING would be
applications that read very large files that do not fit into RAM, from
start to finish.
If you want to improve grepping the kernel tree, the answer lies in
improving the standard scheme, not overloading the specialized
O_STREAMING scheme.
mark
--
[email protected]/[email protected]/[email protected] __________________________
. . _ ._ . . .__ . . ._. .__ . . . .__ | Neighbourhood Coder
|\/| |_| |_| |/ |_ |\/| | |_ | |/ |_ |
| | | | | \ | \ |__ . | | .|. |__ |__ | \ |__ | Ottawa, Ontario, Canada
One ring to rule them all, one ring to find them, one ring to bring them all
and in the darkness bind them...
http://mark.mielke.cc/
On Thu Oct 10, 2002 at 12:20:02AM +0100, Jamie Lokier wrote:
> Mark Mielke wrote:
> > 2) Pages should not be candidates for dropping if the pages belong
> > to the first few pages of a file. (First = 2? 4? 8?) The theory
> > being, that somebody could begin reading the file again from the
> > beginning.
>
> This breaks the benefit of using O_STREAMING to read a lot of small
> files once, as you might do when grepping the kernel tree for example.
I don't think grep is a very good candidate for O_STREAMING. I
usually want the stuff I grep to stay in cache. O_STREAMING is
much better suited to applications like ogle, vlc, xine, xmovie,
xmms etc since there is little reason for the OS to cache things
like songs and movies you aren't likely to hear/see again any
time soon.
-Erik
--
Erik B. Andersen http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--
On Wed, 2002-10-09 at 23:29, Erik Andersen wrote:
> I don't think grep is a very good candidate for O_STREAMING. I
> usually want the stuff I grep to stay in cache. O_STREAMING is
> much better suited to applications like ogle, vlc, xine, xmovie,
> xmms etc since there is little reason for the OS to cache things
> like songs and movies you aren't likely to hear/see again any
> time soon.
Yes. Good point. People are taking this too far. There is a big
difference between being just sequential and use-once. Grep(1) is a
great example of something that _should_ use the pagecache. Subsequent
file accesses, which will occur, should hit.
Look, the pagecache is already smart. New stuff will replace unused
old stuff. On VM pressure, the pagecache will be pruned. Streaming I/O
is a fundamentally different problem in that the data is so large it
_continually_ thrashes the pagecache. Such I/O is sequential and
use-once. You end up with a permanent waste of memory (the cached
I/O).
Let's prove we have a solution to this problem before going after
tangent ones.
Robert Love
On 10-Oct-2002 Erik Andersen wrote:
> I don't think grep is a very good candidate for O_STREAMING. I
> usually want the stuff I grep to stay in cache. O_STREAMING is
> much better suited to applications like ogle, vlc, xine, xmovie,
> xmms etc since there is little reason for the OS to cache things
> like songs and movies you aren't likely to hear/see again any
> time soon.
The kernel already has a cache pruning algorithm. O_STREAMING logic
should not clear caches if there is no need to do that. We could
fake the age of the pages loaded with O_STR to make the kernel
discard them earlier (oh, I SUPPOSE pages have an age to make
a lru replacement algorithm possible).
Bye.
On Thu Oct 10, 2002 at 10:33:36AM +0200, Giuliano Pochini wrote:
>
> On 10-Oct-2002 Erik Andersen wrote:
> > I don't think grep is a very good candidate for O_STREAMING. I
> > usually want the stuff I grep to stay in cache. O_STREAMING is
> > much better suited to applications like ogle, vlc, xine, xmovie,
> > xmms etc since there is little reason for the OS to cache things
> > like songs and movies you aren't likely to hear/see again any
> > time soon.
>
> The kernel already have cache pruning algorithm. O_STREAMING logic
> should not clear caches if there is no need to do that. We could
The entire point of O_STREAMING is to let user space specify
policy. If user space knows with 100% certainty that
the data being read/written from a particular file descriptor is
use-once-and-discard data, then it makes sense to honor that
hint. In this case, user space knows best and can set policy on
a per file descriptor basis.
Note that most applications do not want to use this flag. But
for a few applications it is just perfect. For example, if I
am playing a DVD there is absolutely no point in the kernel
trying to cache the content of the DVD. A DVD has way too much
content for caching it to do any good, and since most people
watch a DVD once through from beginning to end, there is no point
stuffing the DVD's content into the pagecache, thereby crowding
out other things that should remain in cache.
-Erik
--
Erik B. Andersen http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--
On Wed, 9 Oct 2002, Mark Mielke wrote:
> With this in mind, dynamic detection becomes a lot easier and less
> error prone. The simplest way of detecting a file that would benefit
> from having pages dropped is to keep a flag that indicates whether the
> file was *ever* read non-sequentially. If a file was never read
> non-sequentially, pages that are not at the beginning of the file, and
> pages that are earlier than all seek points for the file, or that are
> many pages later than all earlier seek points for the file may be
> safely dropped. In fact, I am surprised this is not implemented
> already. :-)
there is a serious problem with this heuristic: if you have a large video
file that you one time skip into the center of, or rewind slightly, that
file will then be flagged as not being suitable for O_STREAMING.
it doesn't matter how other programs have used this file in the past, what
matters is how this file is being used now.
in your case of two webserver processes trying to access the same file
both with O_STREAMING, let them, the first process starts reading things
and flags the old blocks as 'discard immediately' (would it possibly be
worth adding another FIFO list to the VM to contain 'these pages can be
thrown out of memory instantly with no additional work needed' pages??, or
since I think there is one for freeable cache make a second for 'low
priority' freeable pages such that you throw away all 'low priority' pages
before you touch the normal freeable pages), the second process would then
start reading the same file and the readahead for it would attempt to read
the file in, and the system should be able to notice that these pages are
already in memory (in the low priority freeable list) and rescue them up
to the normal list.
no need to have special code to try and decide if the various pointers are
'close enough' together, the readahead size does that for you (and since
readahead can be tuned as memory pressure grows this means that the
definition of 'close enough' will change dynamically)
David Lang
>> The kernel already have cache pruning algorithm. O_STREAMING logic
>> should not clear caches if there is no need to do that. We could
>
> The entire point of O_STREAMING is to let user space specify
> policy. If user space user space knows with 100% certainty that
> the data being read/written from a particular file descriptor is
> use-once-and-discard data, then it makes sense to honor that
> hint. In this case, user space knows best and can set policy on
> a per file descriptor basis.
Yes, it makes sense, but it's useless or harmful to discard caches
if nobody else needs memory. You just lose data that may be
requested in the future for no reason.
Bye.
On 9 October 2002 17:36, Giuliano Pochini wrote:
> > > Does it drop pages unconditionally ? What happens if I do a
> > > streaming_cat largedatabase > /dev/null while other processes
> > > are working on it ? It's not a good thing to remove the whole
> > > cached data other apps are working on.
> >
> > Anybody could make the cache thrash. I don't see this as an
> > argument against O_STREAMING (whether explicitly activated, or
> > dynamically activated).
>
> In fact it isn't. But I don't undestand why we unconditionally
> discard a page after it has been read. Yes, I told the kernel I will
> not need it anymore, but someone else could need it. I'm not a kernel
> hacker and I don't know if this is possible: when a page is read from
> disk by a O_STR file flag it "kill me first when needed, otherwise
> leave me in memory", and if a page is already cache, just use it and
> change nothing. This will preserve data used by other processes, and
> the data I've just read if there is room. Free memory is wasted
There is almost never room. Linux fills all memory with cache
pretty soon unless you have several gigs of RAM. This is good.
The question is, what to cache and what to drop.
Come on, do you really want to find all your caches washed out
after dinner if you left your box playing MP3s? Or after you
watched MPEG?
--
vda
On Wed, 9 Oct 2002, Rik van Riel wrote:
> On Wed, 9 Oct 2002, Andreas Dilger wrote:
> > On Oct 09, 2002 10:14 -0400, Robert Love wrote:
> > > On Wed, 2002-10-09 at 10:10, Marco Colombo wrote:
> > >
> > > > > #define O_NOFOLLOW 0400000 /* don't follow links */
> > > > > #define O_NOFOLLOW 0x20000 /* don't follow links */
> > >
> > > No need. See for example O_NOFOLLOW right above. Each architecture can
> > > do as it pleases (I wish otherwise, but...).
> >
> > I would say - if you are picking a new flag that doesn't need to have
> > compatibility with any platform-specific existing flag, simply set them
> > all high enough so that they are the same on all platforms.
>
> Doesn't really matter, you can't run x86 binaries on MIPS so
> you need to recompile anyway.
>
> Source level compatibility is enough for flags like this.
>
> regards,
>
> Rik
>
True, but either you include kernel headers from user apps, or wait for
glibc (or [whatever]libc) to catch up, or do something like this:
#define O_STREAMING 04000000
fd = open(file, ... | O_STREAMING);
(quoted directly from one of Robert's messages).
The latter is broken on MIPS, and requiring either glibc headers or the
programmer to handle different archs is unfortunate. One of the biggest
advantages of O_STREAMING is that it's simple and elegant to integrate
it into existing apps: let's make it even easier by choosing the same
value, so that the above C is the right thing.
Besides, not all the world is C. I don't expect, say, Perl or Python to
support Linux O_STREAMING on their POSIX modules. Perl does pass flags to
open(2) untouched (I haven't tested Python yet), but right now I have to:
- wait for an official Perl update that supports O_STREAMING;
- test explicitly for different archs in my perl program in order
to choose the right O_STREAMING value (or hack system modules to do
the same);
- forget about portability on my Perl script (which is somewhat worse than
doing the same for a C program: one of the goals of using Perl *is*
portability).
Note that having different O_NOFOLLOW (or even O_CREAT) values is less
annoying, since I expect any language that allows me to pass flags to
open(2) (or fcntl(2)) to define those as macros (constants, subroutines
or whatever).
In the end, I see we can choose different values, but why should we?
.TM.
--
____/ ____/ /
/ / / Marco Colombo
___/ ___ / / Technical Manager
/ / / ESI s.r.l.
_____/ _____/ _/ [email protected]
In article <[email protected]>,
Giuliano Pochini <[email protected]> wrote:
>>> The kernel already have cache pruning algorithm. O_STREAMING logic
>>> should not clear caches if there is no need to do that. We could
>>
>> The entire point of O_STREAMING is to let user space specify
>> policy. If user space user space knows with 100% certainty that
>> the data being read/written from a particular file descriptor is
>> use-once-and-discard data, then it makes sense to honor that
>> hint. In this case, user space knows best and can set policy on
>> a per file descriptor basis.
>
>Yes, it makes sense, but it's useless or harmful to discard caches
>if nobody else needs memory. You just lose data that may be
>requested in the future for no reason.
But to cache the DVD you will have to throw out the data which
is already there for no reason, and that is exactly what you
want to avoid.
At least on my machine buffers/cache _always_ fill up all free
memory. I don't want the streaming DVD to push that out.
Mike.
Mark Mielke wrote:
> I might be wrong, but it seems to me that O_STREAMING isn't the answer
> to everything. The primary benefactors of O_STREAMING would be
> applications that read very large files that do not fit into RAM, from
> start to finish.
It doesn't have to be a file that doesn't fit into RAM. Remember, other
running apps want memory and cache too, so the "fair share" of memory
for _this_ process is much smaller than all of RAM.
So, O_STREAMING makes sense for all files where we know that we're going
sequentially and that caching this for long won't help.
(Because the contents likely will be pushed out before we need
them again anyway (DVD case) or we know we're going to delete
the file, or we simply don't want to push anything else
out even if we could cache this.)
Helge Hafting
Giuliano Pochini wrote:
> Yes, it makes sense, but it's useless or harmful to discard caches
> if nobody else needs memory. You just lose data that may be
> requested in the future for no reason.
Sure, so the ideal is to not drop unconditionally, but
make sure that the "finished" O_STREAMING pages are
the very first ones to go whenever memory pressure happens.
The question then becomes "can you do that, with no more
overhead or code complexity than the existing stuff?"
It wouldn't necessarily make much difference, because
a linux machine is almost always under memory pressure.
Free memory is simply filled up with cache till there
is no more left. From then on, all requests for memory
are handled by throwing something else out of cache
or into swap. In that case the streaming pages
are evicted quickly anyway, and the ideal case
is no different from the implemented case.
Helge Hafting
On Thu, 2002-10-10 at 04:29, Erik Andersen wrote:
> I don't think grep is a very good candidate for O_STREAMING. I
> usually want the stuff I grep to stay in cache. O_STREAMING is
> much better suited to applications like ogle, vlc, xine, xmovie,
> xmms etc since there is little reason for the OS to cache things
> like songs and movies you aren't likely to hear/see again any
> time soon.
I'm not sure O_STREAMING is what you actually want here; it's proper
working drop behind. That -shouldn't- need a magic flag if the kernel is
doing the VM things right.
For streaming media writes you want a thread (we lack aio_fsync it
seems), you do a regular asynchronous fsync to keep the buffering
smooth.
For streaming media read the kernel ought to be able to get it right,
and if not then I'd much rather the kernel gave _me_ total control
Instead of O_STREAMING therefore I'd much prefer to have
fadvise(filehandle, offset, length, FADV_DONTNEED);
Its quite possible that most of the rest of the madvise notions aren't
worth implementing, but we have the flexibility to do. The fadvise
interface also lets you pick which ranges you evict, so now I can do
streaming media but not fadvise out of cache key frames so that my
chapter starts just happen to generally be in cache as do a few I frames
behind the read pointer - (for rewind).
Do that with O_STREAMING ?
Alan
On Thu, Oct 10, 2002 at 12:38:52PM +0100, Alan Cox wrote:
> Im not sure O_STREAMING is what you actually want here, its proper
> working drop behind. That -shouldnt- need a magic flag if the kernel is
> doing the VM things right.
This is a somewhat painful issue. The negative effect of its absence on
UP is quite visible, but multiprogrammed streaming SMP workloads (i.e.
badari's 40 simultaneous dd's) appear to degrade severely in the
presence of the available drop behind implementations.
IIRC akpm suggested in response to the SEGQ/NRU patches that a method
of reducing the arrival rate to the relevant LRU locks for drop behind
would be required to remain performant on multiprogrammed streaming
workloads. This is not entirely simple as methods of deferred LRU list
manipulations are largely unclear, and Rik pointed out that marking the
affected pages for immediate deactivation on scanning like my attempt
at resolving this is a grossly ineffective method of actually
accomplishing drop behind.
On Thu, Oct 10, 2002 at 12:38:52PM +0100, Alan Cox wrote:
> Instead of O_STREAMING therefore I'd much prefer to have
> fadvise(filehandle, offset, length, FADV_DONTNEED);
This issue also arose in response to Rik's NRU/SEGQ patches.
Essentially his accounting was on a per-mm basis and I questioned
whether or not it should be done on a finer-grained level. No clear
response or confirming/conflicting opinion was ever posted.
On Thu, Oct 10, 2002 at 12:38:52PM +0100, Alan Cox wrote:
> Its quite possible that most of the rest of the madvise notions aren't
> worth implementing, but we have the flexibility to do. The fadvise
> interface also lets you pick which ranges you evict, so now I can do
> streaming media but not fadvise out of cache key frames so that my
> chapter starts just happen to generally be in cache as do a few I frames
> behind the read pointer - (for rewind).
> Do that with O_STREAMING ?
The level of control you propose is clearly much stronger than
O_STREAMING. Unfortunately I'm not in a position to comment on the need
for or the usefulness of the interface.
Bill
Le jeu 10/10/2002 à 13:01, Helge Hafting a écrit :
> Giuliano Pochini wrote:
>
> > Yes, it makes sense, but it's useless or harmful to discard caches
> > if nobody else needs memory. You just lose data that may be
> > requested in the future for no reason.
>
> Sure, so the ideal is to not drop unconditionally, but
> make sure that the "finished" O_STREAMING pages are
> the very first ones to go whenever memory pressure happens.
IMHO this shouldn't be taken care of. As you say it, a linux box has no
free memory (or it's been very recently booted), so the problem is not
to make O_STREAMING pages "low priority", but just to make them not stay
in the cache (perhaps just keep a few KB worth of them in case of a
limited seek back, but not more).
Xav
On 10-Oct-2002 Helge Hafting wrote:
> Giuliano Pochini wrote:
>
>> Yes, it makes sense, but it's useless or harmful to discard caches
>> if nobody else needs memory. You just lose data that may be
>> requested in the future for no reason.
>
> Sure, so the ideal is to not drop unconditionally, but
> make sure that the "finished" O_STREAMING pages are
> the very first ones to go whenever memory pressure happens.
>
> The question then becomes "can you do that, with no more
> overhead or code complexity than the existing stuff?"
I don't know enough of linux internals to suggest anything
really useful. Perhaps something like: "drop pages which
weren't already loaded when we requested them" might be
enough to prevent cached stuff used by other tasks to be
removed from memory.
> It wouldn't necessarily make much difference, because
> a linux machine is almost always under memory pressure.
> Free memory is simply filled up with cache till there
> is no more left. From then on, all requests for memory
> are handled by throwing something else out of cache
> or into swap. In that case the streaming pages
> are evicted quickly anyway, and the ideal case
> is no different from the implemented case.
O_STREAMING is a way to reduce cache footprint of some
files, and it does the job very well, unless those files
are accessed concurrently by two or more processes.
Think again about backing up a large database. I don't
want to use tar because it kills the caches. I would
like a way to read the db so that the cached part of
the db (the 20% which gets 80% of accesses) is not
expunged.
Bye.
> Look, the pagecache is already smart. New stuff will replace unusued
> old stuff. On VM pressure, the pagecache will be pruned. Streaming I/O
> is a fundamentally different problem in that the data is so large it
> _continually_ thrashes the pagecache. Such I/O is sequential and
> use-once. You end up with a permanent waste of memory (the cached
> I/O).
When a process opens a file with O_STREAMING, it tells the kernel
it will use the data only once, but it tells nothing about other
tasks. If that process reads something which is already cached,
then it must not drop it because someone else used it recently
and IMHO only the pagecache should be allowed to drop it.
> Let's prove we have a solution to this problem before going after
> tangent ones.
If "solution" means "code", sorry, I can't help :(
Bye.
Alan Cox wrote:
>
> ...
> Instead of O_STREAMING therefore I'd much prefer to have
>
> fadvise(filehandle, offset, length, FADV_DONTNEED);
fadvise would make some sense - nice that it's a standardised interface.
It isn't really implementable in 2.4, because of that "offset, length"
thing. We either have to do a pagecache probe for each page, which
gets painful if the user asked for 10,000,000 pages or we do a
pagelist walk which is painful if the user asked for one page.
In 2.5, the radix tree gang lookup thing will do this search in O(zilch).
The other problem with fadvise is writebehind - there are up to
30 seconds' worth of dirty pages behind the application's write
cursor which fadvise wouldn't be able to do anything with. So
the application would end up running fadvise(offset=0, length=current-pos)
all the time. Which is equivalent to O_STREAMING.
dropbehind cannot work as effectively because we're basically forced
to put the pages at the head of the inactive LRU and hope that they're
written before they reach the tail. By which time we've evicted
all the other pagecache on the inactive list.
Could put the pages at the _tail_ of the LRU for reads; but that's
equivalent to just reclaiming them on the spot. Which is equivalent
to O_STREAMING.
On Wed, 9 Oct 2002, Erik Andersen wrote:
> On Thu Oct 10, 2002 at 12:20:02AM +0100, Jamie Lokier wrote:
> > Mark Mielke wrote:
> > > 2) Pages should not be candidates for dropping if the pages belong
> > > to the first few pages of a file. (First = 2? 4? 8?) The theory
> > > being, that somebody could begin reading the file again from the
> > > beginning.
> >
> > This breaks the benefit of using O_STREAMING to read a lot of small
> > files once, as you might do when grepping the kernel tree for example.
>
> I don't think grep is a very good candidate for O_STREAMING. I
> usually want the stuff I grep to stay in cache. O_STREAMING is
> much better suited to applications like ogle, vlc, xine, xmovie,
> xmms etc since there is little reason for the OS to cache things
> like songs and movies you aren't likely to hear/see again any
> time soon.
Personally I would settle for updatedb being converted.
Gerhard
--
Gerhard Mack
[email protected]
<>< As a computer I find your faith in technology amusing.
On Thu, 2002-10-10 at 16:34, Andrew Morton wrote:
> In 2.5, the radix tree gang lookup thing will do this search in O(zilch).
>
> The other problem with fadvise is writebehind - there are up to
> 30 seconds' worth of dirty pages behind the application's write
> cursor which fadvise wouldn't be able to do anything with. So
> the application would end up running fadvise(offset=0, length=current-pos)
> all the time. Which is equivalent to O_STREAMING.
The write side is actually not very interesting. We can do that already
with fsync in a thread.
On Thu, Oct 10, 2002 at 08:34:05AM -0700, Andrew Morton wrote:
>
> dropbehind cannot work as effectively because we're basically forced
> to put the pages at the head of the inactive LRU and hope that they're
> written before they reach the tail. By which time we've evicted
> all the other pagecache on the inactive list.
Seems like we're not keeping enough information. Perhaps we could have
something like "streaming rank" at the mapping level and use that as a
hint to prioritize evictions or possibly where to insert in the LRU.
Throw new mappings at the streaming end of the rank list and whenever
they fault behind the r/w pointer, swap them upwards in the list.
If you're playing an MP3, it starts out at the streaming end of the
list and starts dropping pages at the first sign of pressure. Since
this really is streaming, no problem.
Now run Mozilla, which starts out behind the MP3, seeks around
randomly paging in code (which looks like fault behind), says "um,
excuse me", and bumps ahead of the MP3.
Let Mozilla idle, start OpenOffice, which bumps in front of the MP3,
and if it hits more pressure, bumps in front of the idling Mozilla as
well.
Then updatedb starts, reads a ton of files in a streaming fashion, but
none of those streaming file mappings push above the idling apps.
--
"Love the dolphins," she advised him. "Write by W.A.S.T.E.."
On Thu, Oct 10, 2002 at 12:55:08PM +0200, Helge Hafting wrote:
> Mark Mielke wrote:
> > I might be wrong, but it seems to me that O_STREAMING isn't the answer
> > to everything. The primary beneficiaries of O_STREAMING would be
> > applications that read very large files that do not fit into RAM, from
> > start to finish.
> It doesn't have to be a file that doesn't fit into RAM. Remember, other
> running apps want memory and cache too, so the "fair share" of memory
> for _this_ process is much smaller than all of RAM.
> So, O_STREAMING makes sense for all files where we know that we're going
> sequentially and that caching this for long won't help.
> (Because the contents likely will be pushed out before we need
> them again anyway (DVD case) or we know we're going to delete
> the file, or we simply don't want to push anything else
> out even if we could cache this.)
Then perhaps O_STREAMING should be called O_EXTENDEDSTREAMING.
If you overload O_STREAMING to contain all possible uses for sequential
reads, you end up hurting yourself.
Small files are different beasts from large files. If you want O_STREAMING
to work in all cases, you really want standard mode to work in all cases,
and O_STREAMING is not for you.
mark
--
[email protected]/[email protected]/[email protected] __________________________
. . _ ._ . . .__ . . ._. .__ . . . .__ | Neighbourhood Coder
|\/| |_| |_| |/ |_ |\/| | |_ | |/ |_ |
| | | | | \ | \ |__ . | | .|. |__ |__ | \ |__ | Ottawa, Ontario, Canada
One ring to rule them all, one ring to find them, one ring to bring them all
and in the darkness bind them...
http://mark.mielke.cc/
On Thu Oct 10, 2002 at 12:33:07PM +0200, Marco Colombo wrote:
> True, but either you include kernel headers from user apps, or wait for
> glibc (or [whatever]libc) to catch up, or do something like this:
>
> #define O_STREAMING 04000000
>
> fd = open(file, ... | O_STREAMING);
>
> (quoted directly from one of Robert's messages).
I dunno about glibc, but I stuck support for O_STREAMING into
uClibc last night... :)
-Erik
--
Erik B. Andersen http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--
On Thu, Oct 10, 2002 at 03:17:35PM +0200, Giuliano Pochini wrote:
> O_STREAMING is a way to reduce the cache footprint of some
> files, and it does the job very well, unless those files
> are accessed concurrently by two or more processes.
> Think again about backing up a large database. I don't
> want to use tar because it kills the caches. I would
> like a way to read the db so that the cached part of
> the db (the 20% which gets 80% of accesses) is not
> expunged.
Unless you are pausing the database (causing the files on disk to be in a
useful state) and then reading the file, you will have trouble. Anything
else will have to synchronize with the database itself, and thus can't use
O_STREAMING.
On Thu, Oct 10, 2002 at 11:37:18AM -0400, Gerhard Mack wrote:
> Personally I would settle for updatedb being converted.
>
But updatedb doesn't actually read any of the files...
On Thu, Oct 10, 2002 at 03:39:19PM +0200, Giuliano Pochini wrote:
>
> > Look, the pagecache is already smart. New stuff will replace unused
> > old stuff. On VM pressure, the pagecache will be pruned. Streaming I/O
> > is a fundamentally different problem in that the data is so large it
> > _continually_ thrashes the pagecache. Such I/O is sequential and
> > use-once. You end up with a permanent waste of memory (the cached
> > I/O).
>
> When a process opens a file with O_STREAMING, it tells the kernel
> it will use the data only once, but it tells nothing about other
> tasks. If that process reads something which is already cached,
> then it must not drop it because someone else used it recently,
> and IMHO only the pagecache should be allowed to drop it.
>
You are missing the point. If the app thinks that might happen, it
shouldn't use O_STREAMING.
Though, how do you get around some binary app using O_STREAMING when it
shouldn't?
On Thu Oct 10, 2002 at 03:50:50PM -0700, Mike Fedyk wrote:
> You are missing the point. If the app thinks that might happen, it
> shouldn't use O_STREAMING.
>
> Though, how do you get around some binary app using O_STREAMING when it
> shouldn't?
LD_PRELOAD to overload open(2) should do the job nicely
-Erik
--
Erik B. Andersen http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--
On Thu, 10 Oct 2002, Mike Fedyk wrote:
> On Thu, Oct 10, 2002 at 11:37:18AM -0400, Gerhard Mack wrote:
> > Personally I would settle for updatedb being converted.
> >
>
> But updatedb doesn't actually read any of the files...
Correct; however, it seems to be what shoves everything out of the cache
every night.
Gerhard
--
Gerhard Mack
[email protected]
<>< As a computer I find your faith in technology amusing.
On 2002-10-10, Mike Fedyk <[email protected]> wrote:
> On Thu, Oct 10, 2002 at 03:17:35PM +0200, Giuliano Pochini wrote:
> > Think again about backing up a large database. I don't
> > want to use tar because it kills the caches. I would
> > like a way to read the db so that the cached part of
> > the db (the 20% which gets 80% of accesses) is not
> > expunged.
> Unless you are pausing the database (causing the files on disk to be in
> a useful state) and then reading the file you will have trouble.
> Anything else will have to synchronize with the database itself, and
> thus can't use O_STREAMING.
Pausing the database != putting the database into readonly mode, which is
all that would really be required. If your writer-processes are distinct
from your reader-processes, you could suspend them (and/or batch up writes
to temp tables to shrink your externally-felt maintenance window), tell
the DB to flush pending writes, then dump with O_STREAMING-aware tar (or
db-specific tools that still must pass through all tables/files) while
read performance is only somewhat impacted, and cache isn't completely
killed.
Or, consider the case where the database isn't anywhere near all that the
system does. Think static content + DB-driven webserver, where the DB
*can* be completely shut down (and those parts unavailable) during
backups, while static content serving still goes on efficiently.
--
Hank Leininger <[email protected]>
On 10-Oct-2002 Mike Fedyk wrote:
>> [...] I would
>> like a way to read the db so that the cached part of
>> the db (the 20% which gets 80% of accesses) is not
>> expunged.
>
> Unless you are pausing the database (causing the files on disk to be in a
> useful state) and then reading the file you will have trouble. Anything
> else will have to synchronize with the database itself, and thus can't use
> O_STREAMING.
All the cached db pages will be dropped regardless of their state. Any
further access to the db will read the data from disk again. I'm
talking only about performance, not about db coherency.
Bye.
On Thu, Oct 10, 2002 at 10:14:37PM -0400, Gerhard Mack wrote:
> Correct; however, it seems to be what shoves everything out of the
> cache every night.
Because it reads all your metadata ... O_STREAMING won't help here.
--cw
>> When a process opens a file with O_STREAMING, it tells the kernel
>> it will use the data only once, but it tells nothing about other
>> tasks. If that process reads something which is already cached,
>> then it must not drop it because someone else used it recently,
>> and IMHO only the pagecache should be allowed to drop it.
>
> You are missing the point. If the app thinks that might happen, it
> shouldn't use O_STREAMING.
>
> Though, how do you get around some binary app using O_STREAMING when it
> shouldn't?
Yes, it is with the current behaviour of O_STREAMING. If we change it to
what I said above, O_STREAMING becomes useful in a larger set of cases
with no drawbacks, I think. Not dropping pages that were not loaded with
the O_STREAMING flag sounds simple, but I don't know how easy it is to
implement.
Bye.
Mike Fedyk wrote:
>
> On Thu, Oct 10, 2002 at 03:39:19PM +0200, Giuliano Pochini wrote:
[...]
> > When a process opens a file with O_STREAMING, it tells the kernel
> > it will use the data only once, but it tells nothing about other
> > tasks. If that process reads something which is already cached,
> > then it must not drop it because someone else used it recently,
> > and IMHO only the pagecache should be allowed to drop it.
> >
>
> You are missing the point. If the app thinks that might happen, it
> shouldn't use O_STREAMING.
The app _can't_ know, so nothing should use O_STREAMING?
I think the idea seems good - if a page requested for streaming
happens to be in cache already, don't mark it for early eviction.
This approach ensures that streaming apps don't affect the caches
of other apps at all. There are way too many cases where
streaming is useful but we don't know if others might be using
the pages - perhaps in other ways.
Consider searching _all_ files on disk for some string. Clearly
a cache-killer that would benefit from streaming; without
streaming, this pushes everything else from cache except
for the last files searched.
But streaming that unconditionally marks pages for eviction
will also kill all cache in this case. Both data and binaries.
Streaming that only evicts pages it _brought in_ can
search the entire disk (or some reasonable but large subset)
leaving almost all other cache intact.
Now, if we could make updatedb use this kind of streaming
for its directory traversals... :-)
Helge Hafting