--- linux-2.6-tmp/mm/msync.c.=K0000=.orig
+++ linux-2.6-tmp/mm/msync.c
@@ -127,13 +127,10 @@ static int filemap_sync(struct vm_area_s
/*
* MS_SYNC syncs the entire file - including mappings.
*
- * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just
- * marks the relevant pages dirty. The application may now run fsync() to
- * write out the dirty pages and wait on the writeout and check the result.
- * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
- * async writeout immediately.
- * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to
- * applications.
+ * MS_ASYNC once again starts I/O (it did not between 2.5.68 and 2.6.4.)
+ * SingleUnix requires it. If an application wants to queue dirty pages
+ * for normal asychronous writeback, msync with flags==0 should achieve
+ * that on all kernels at least as far back as 2.4.
*/
static int msync_interval(struct vm_area_struct * vma,
unsigned long start, unsigned long end, int flags)
@@ -147,20 +144,22 @@ static int msync_interval(struct vm_area
if (file && (vma->vm_flags & VM_SHARED)) {
ret = filemap_sync(vma, start, end-start, flags);
- if (!ret && (flags & MS_SYNC)) {
+ if (!ret && (flags & (MS_SYNC|MS_ASYNC))) {
struct address_space *mapping = file->f_mapping;
int err;
down(&mapping->host->i_sem);
ret = filemap_fdatawrite(mapping);
- if (file->f_op && file->f_op->fsync) {
- err = file->f_op->fsync(file,file->f_dentry,1);
- if (err && !ret)
+ if (flags & MS_SYNC) {
+ if (file->f_op && file->f_op->fsync) {
+ err = file->f_op->fsync(file, file->f_dentry, 1);
+ if (err && !ret)
+ ret = err;
+ }
+ err = filemap_fdatawait(mapping);
+ if (!ret)
ret = err;
}
- err = filemap_fdatawait(mapping);
- if (!ret)
- ret = err;
up(&mapping->host->i_sem);
}
}
On Wed, 31 Mar 2004, Stephen C. Tweedie wrote:
>
> although I can't find an unambiguous definition of "queued for service"
> in the online standard. I'm reading it as requiring that the I/O has
> reached the block device layer, not simply that it has been marked dirty
> for some future writeback pass to catch; Uli agrees with that
> interpretation.
That interpretation makes pretty much zero sense.
If you care about the data hitting the disk, you have to use fsync() or
similar _anyway_, and pretending anything else is just bogus.
As such, just marking the pages dirty is as much of a "queuing" them for
write as actually writing them, since in both cases the guarantees are
_exactly_ the same: the pages have not hit the disk by the time the system
call returns, but will hit the disk at some time in the future.
Having the requirement that it is on some sw-only request queue is
nonsensical, since such a queue is totally invisible from a user
perspective.
User space has no idea about "block device layer" vs "VM layer" queues,
and trying to distinguish between the two is madness. It's just an
internal implementation issue that has no meaning to the user.
Linus
"Stephen C. Tweedie" <[email protected]> wrote:
>
> Hi,
>
> I've been looking at a discrepancy between msync() behaviour on 2.4.9
> and newer 2.4 kernels, and it looks like things changed again in
> 2.5.68. From the ChangeLog:
>
> ChangeSet 1.971.76.156 2003/04/09 11:31:36 [email protected]
> [PATCH] Make msync(MS_ASYNC) no longer start the I/O
>
> MS_ASYNC will currently wait on previously-submitted I/O, then start new I/O
> and not wait on it. This can cause undesirable blocking if msync is called
> rapidly against the same memory.
>
> So instead, change msync(MS_ASYNC) to not start any IO at all. Just flush
> the pte dirty bits into the pageframe and leave it at that.
>
> The IO _will_ happen within a kupdate period. And the application can use
> fsync() or fadvise(FADV_DONTNEED) if it actually wants to schedule the IO
> immediately.
>
> Unfortunately, this seems to contradict SingleUnix requirements, which
> state:
>
> When MS_ASYNC is specified, msync() shall return immediately
> once all the write operations are initiated or queued for
> servicing
>
> although I can't find an unambiguous definition of "queued for service"
> in the online standard. I'm reading it as requiring that the I/O has
> reached the block device layer, not simply that it has been marked dirty
> for some future writeback pass to catch; Uli agrees with that
> interpretation.
I don't think I agree with that. If "queued for service" means we've
started the I/O, then what does "initiated" mean, and why did they specify
"initiated" separately?
What triggered all this was a dinky little test app which Linus wrote to
time some aspect of P4 tlb writeback latency. It sits in a loop dirtying a
page then msyncing it with MS_ASYNC. It ran very poorly, because MS_ASYNC
ended up waiting on the previously-submitted I/O before starting new I/O.
One approach to improving that would be for MS_ASYNC to say "if the page is
already under writeout then just skip the I/O". But that's worthless,
really - it makes the MS_ASYNC semantics too vague.
As you point out, Linus's app should have used the "flags=0" linux
extension. Didn't think of that.
Your reversion patch would mean that current applications which use
MS_ASYNC will again suffer large latencies if the pages are under writeout.
Sure, users could switch apps to using flags=0 to avoid that, but people
don't know to do that.
So given that SUS is ambiguous about this, I'd suggest that we be able to
demonstrate some real-world reason why this matters. Why are you concerned
about this?
> The 2.5.68 changeset also includes the comment:
>
> (This has triggered an ext3 bug - the page's buffers get dirtied so fast
> that kjournald keeps writing the buffers over and over for 10-20 seconds
> before deciding to give up for some reason)
>
> Was that ever resolved? If it's still there, I should have a look at it
> if we're restoring the old trigger.
(These changelog thingies are useful, aren't they?)
I don't recall checking since that time. I expect that Linus's test app
will still livelock kjournald in the current -linus tree - kjournald sits
there trying to write out the dirty buffers but the dang things just keep
on getting dirtied.
If so, I'm sure this patch (queued for 2.6.6) will fix it:
ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.5-rc3/2.6.5-rc3-mm3/broken-out/jbd-move-locked-buffers.patch
Hi,
On Wed, 2004-03-31 at 23:53, Andrew Morton wrote:
> "Stephen C. Tweedie" <[email protected]> wrote:
> > Unfortunately, this seems to contradict SingleUnix requirements, which
> > state:
> > When MS_ASYNC is specified, msync() shall return immediately
> > once all the write operations are initiated or queued for
> > servicing
> > although I can't find an unambiguous definition of "queued for service"
> > in the online standard. I'm reading it as requiring that the I/O has
> > reached the block device layer
> I don't think I agree with that. If "queued for service" means we've
> started the I/O, then what does "initiated" mean, and why did they specify
> "initiated" separately?
I'd interpret "initiated" as having reached hardware. "Queued for
service" is much more open to interpretation: Uli came up with "the data
must be actively put in a stage where I/O is initiated", which still
doesn't really address what sort of queueing is allowed.
> What triggered all this was a dinky little test app which Linus wrote to
> time some aspect of P4 tlb writeback latency. It sits in a loop dirtying a
> page then msyncing it with MS_ASYNC. It ran very poorly, because MS_ASYNC
> ended up waiting on the previously-submitted I/O before starting new I/O.
Sure. There are lots of ways an interface can be misused, though: you
only know if one use is valid or not once you've determined what the
_correct_ use is. I'm much more concerned about getting a correct
interpretation of the spec than of making IO fast for the sake of a
memory benchmark. :-)
> One approach to improving that would be for MS_ASYNC to say "if the page is
> already under writeout then just skip the I/O". But that's worthless,
> really - it makes the MS_ASYNC semantics too vague.
Agreed.
> Your reversion patch would mean that current applications which use
> MS_ASYNC will again suffer large latencies if the pages are under writeout.
Well, this whole issue came up precisely because somebody was seeing
exactly such a latency hit going from 2.4.9 to a later kernel. We've
not really been consistent about it in the past.
> Sure, users could switch apps to using flags=0 to avoid that, but people
> don't know to do that.
Exactly why we need documentation for that combination, whatever
happens.
> So given that SUS is ambiguous about this, I'd suggest that we be able to
> demonstrate some real-world reason why this matters. Why are you concerned
> about this?
Just for the reason you mentioned --- a real-world app (in-house, so
flags==0 is actually a valid solution for them) which was seeing
performance degradation when the "MS_ASYNC submits IO" was introduced in
the first place. But it was internally written, so I've no idea at all
whether or not the app was assuming one behaviour or the other on other
Unixen.
--Stephen
Hi,
On Wed, 2004-03-31 at 23:37, Linus Torvalds wrote:
> On Wed, 31 Mar 2004, Stephen C. Tweedie wrote:
> >
> > although I can't find an unambiguous definition of "queued for service"
> > in the online standard. I'm reading it as requiring that the I/O has
> > reached the block device layer, not simply that it has been marked dirty
> > for some future writeback pass to catch; Uli agrees with that
> > interpretation.
>
> That interpretation makes pretty much zero sense.
>
> If you care about the data hitting the disk, you have to use fsync() or
> similar _anyway_, and pretending anything else is just bogus.
You can make the same argument for either implementation of MS_ASYNC.
And there's at least one way in which the "submit IO now" version can be
used meaningfully --- if you've got several specific areas of data in
one or more mappings that need flushed to disk, you'd be able to
initiate IO with multiple MS_ASYNC calls and then wait for completion
with either MS_SYNC or fsync(). That gives you an interface that
corresponds somewhat with the region-based filemap_sync();
filemap_fdatawrite(); filemap_fdatawait() that the kernel itself uses.
> Having the requirement that it is on some sw-only request queue is
> nonsensical, since such a queue is totally invisible from a user
> perspective.
It's very much visible, just from a performance perspective, if you want
to support "kick off this IO, I'm going to wait for the completion
shortly." If that's the interpretation of MS_ASYNC, then the app is
basically saying it doesn't want the writeback mechanism to be idle
until the writes have completed, regardless of whether it's a block
device or an NFS file or whatever underneath.
But whether that's a legal use of MS_ASYNC really depends on what the
standard is requiring. I could be persuaded either way. Uli?
Does anyone know what other Unixen do here?
--Stephen
On Wed, 1 Apr 2004, Stephen C. Tweedie wrote:
>
> On Wed, 2004-03-31 at 23:37, Linus Torvalds wrote:
>
> > If you care about the data hitting the disk, you have to use fsync() or
> > similar _anyway_, and pretending anything else is just bogus.
>
> You can make the same argument for either implementation of MS_ASYNC.
Exactly.
Which is why I say that the implementation cannot matter, because user
space would be _buggy_ if it depended on some timing issue.
> And there's at least one way in which the "submit IO now" version can be
> used meaningfully --- if you've got several specific areas of data in
> one or more mappings that need flushed to disk, you'd be able to
> initiate IO with multiple MS_ASYNC calls and then wait for completion
> with either MS_SYNC or fsync().
Why wouldn't you be able to do that with the current one?
The advantage of the current MS_ASYNC is absolutely astoundingly HUGE:
because we don't wait for in-progress IO, it can be used to efficiently
synchronize multiple different areas, and then after that waiting for them
with _one_ single fsync().
In contrast, the "wait for queued IO" approach can't sanely do that,
exactly because it will wait in the middle, depending on other activity at
the same time. It will always have the worry that it happens to do the
msync() at the wrong time, and then wait synchronously when it shouldn't.
More importantly, the current behaviour makes certain patterns _possible_
that your suggested semantics simply cannot do efficiently. If we have
data records smaller than a page, and want to mark them dirty as they
happen, the current msync() allows that - it doesn't matter that another
datum was marked dirty just a moment ago. Then, you do one fsync() only
when you actually want to _commit_ a series of updates before you change
the index.
But if we want to have another flag, with MS_HALF_ASYNC, that's certainly
ok by me. I'm all for choice. It's just that I most definitely want the
choice of doing it the way we do it now, since I consider that to be the
_sane_ way.
> It's very much visible, just from a performance perspective, if you want
> to support "kick off this IO, I'm going to wait for the completion
> shortly."
That may well be worth a call of its own. It has nothing to do with memory
mapping, though - what you're really looking for is fasync().
And yes, I agree that _that_ would make sense. Having some primitives to
start writeout of an area of a file would likely be a good thing.
I'd be perfectly happy with a set of file cache control operations,
including
- start writeback in [a,b]
- wait for [a,b] stable
- and maybe "punch hole in [a,b]"
Then you could use these for write() in addition to mmap(), and you can
first mark multiple regions dirty, and then do a single wait (which is
clearly more efficient than synchronously waiting for multiple regions).
But none of these have anything to do with what SuS or any other standard
says about MS_ASYNC.
> But whether that's a legal use of MS_ASYNC really depends on what the
> standard is requiring. I could be persuaded either way. Uli?
My argument was that a standard CANNOT say anything one way or the other,
because the behaviour IS NOT USER-VISIBLE! A program fundamentally cannot
care, since the only issue is a pure implementation issue of "which queue"
the data got queued onto.
Bringing in a standards body is irrelevant. It's like trying to use the
bible to determine whether protons have a positive charge.
Linus
Linus Torvalds <[email protected]> wrote:
>
> I'd be perfectly happy with a set of file cache control operations,
> including
>
> - start writeback in [a,b]
> - wait for [a,b] stable
> - and maybe "punch hole in [a,b]"
Yup, there are a number of linux-specific fadvise() extensions we
can/should be adding, including "start writeback on this byte range for
flush" and "start writeback on this byte range for data integrity" and
"wait on writeback of this byte range".
Some of these are needed internally for the fs-AIO implementation, and also
for an O_SYNC which only writes the pages which the writer wrote. It's
pretty simple, and it'll be happening.
One wrinkle is that we'd need to add the start/end loff_t pair to the
a_ops->writepages() prototype. But instead I intend to put the start/end
info into struct writeback_control and pass it that way. It seems sleazy
at first but when you think about it, it isn't. It provides forward and
backward compatability, it recognises that it's just a hint and that
filesystems can legitimately sync the whole file and it produces
smaller+faster code.
We might need a wait_on_page_writeback_range() a_op though.
Hi,
On Thu, 2004-04-01 at 01:08, Linus Torvalds wrote:
> > You can make the same argument for either implementation of MS_ASYNC.
> Exactly.
> Which is why I say that the implementation cannot matter, because user
> space would be _buggy_ if it depended on some timing issue.
I see it purely as a performance issue. That's the context in which we
saw the initial complaint about the 2.4 behaviour change.
> > And there's at least one way in which the "submit IO now" version can be
> > used meaningfully --- if you've got several specific areas of data in
> > one or more mappings that need flushed to disk, you'd be able to
> > initiate IO with multiple MS_ASYNC calls and then wait for completion
> > with either MS_SYNC or fsync().
>
> Why wouldn't you be able to do that with the current one?
You can, but only across one fd.
A = mmap(..., a);
B = mmap(..., b);
msync(A, ..., MS_ASYNC);
msync(B, ..., MS_ASYNC);
fsync(a);
fsync(b);
has rather different performance characteristics according to which way
you go. Do deferred writeback and the two fsync()s do serialised IO,
with the fs idle in between. Submit the background IO immediately and
you avoid that.
Anyway, I just tried on a Solaris-2.8 box, and the results are rather
interesting. Doing a simple (touch-one-char, msync one page) loop on a
mmap'ed file on a local scsi disk, MS_ASYNC gives ~15000 msyncs per
second; MS_SYNC gives ~900. [A null getpid() loop gives about 250,000
loops a second.]
However, the "iostat" shows *exactly* the same disk throughput in each
case. MS_ASYNC is causing immediate IO kickoff, but shows only ~900 ios
per second, the same as the ios-per-second for MS_SYNC and the same as
the MS_SYNC loop frequency.
So it appears that on Solaris, MS_ASYNC is kicking off instant IO, but
is not waiting for existing IO to complete first. So if we have an IO
already in progress, then many msync calls end up queuing the *same*
subsequent IO, and once one new IO is queued, further MS_ASYNC msyncs
don't bother scheduling a new one (on the basis that the
already-scheduled one hasn't started yet so the new data is already
guaranteed to hit disk.)
So Solaris behaviour is indeed to begin IO as soon as possible on
MS_ASYNC, but they are doing it far more efficiently than our current
msync code can do.
> The advantage of the current MS_ASYNC is absolutely astoundingly HUGE:
> because we don't wait for in-progress IO, it can be used to efficiently
> synchronize multiple different areas, and then after that waiting for them
> with _one_ single fsync().
The Solaris one manages to preserve those properties while still
scheduling the IO "soon". I'm not sure how we could do that in the
current VFS, short of having a background thread scheduling deferred
writepage()s as soon as the existing page becomes unlocked.
> More importantly, the current behaviour makes certain patterns _possible_
> that your suggested semantics simply cannot do efficiently. If we have
> data records smaller than a page, and want to mark them dirty as they
> happen, the current msync() allows that - it doesn't matter that another
> datum was marked dirty just a moment ago. Then, you do one fsync() only
> when you actually want to _commit_ a series of updates before you change
> the index.
> But if we want to have another flag, with MS_HALF_ASYNC, that's certainly
> ok by me. I'm all for choice.
Yes, but we _used_ to have that choice --- call msync() with flags == 0,
and you'd get the deferred kupdated writeback; call it with MS_ASYNC and
you'd get instant IO kickoff; call it with MS_SYNC and you'd get
synchronous completion. But now we've lost the instant kickoff, async
completion option, and MS_ASYNC behaves just like flags==0.
So I'm all for adding the choice back, and *documenting* it so that
people know exactly what to expect in all three cases. Whether the
choice comes from an fadvise option or an msync() doesn't bother me that
much.
In that case, the decision about which version of the behaviour MS_ASYNC
should give is (as it should be) a matter of obeying the standard
correctly, and the other useful behaviours are preserved elsewhere.
Which brings us back to trying to interpret the vague standard. Both
Uli's interpretation and the Solaris implementation suggest that we need
to start the writepage sooner rather than later.
> > It's very much visible, just from a performance perspective, if you want
> > to support "kick off this IO, I'm going to wait for the completion
> > shortly."
>
> That may well be worth a call of its own. It has nothing to do with memory
> mapping, though - what you're really looking for is fasync().
Indeed. And msync(flags==0) remains as a way of synchronising mmaps
with the inode-dirty-list fasync writeback.
> And yes, I agree that _that_ would make sense. Having some primitives to
> start writeout of an area of a file would likely be a good thing.
>
> I'd be perfectly happy with a set of file cache control operations,
> including
>
> - start writeback in [a,b]
posix_fadvise() seems to do something a little like this already: the
FADV_DONTNEED handler tries
if (!bdi_write_congested(mapping->backing_dev_info))
filemap_flush(mapping);
before going into the invalidate_mapping_pages() call. Having that (a)
limited to the specific file range passed into the fadvise(), and (b)
available as a separate function independent of the DONTNEED page
invalidator, would seem like an entirely sensible extension.
The obvious implementations would be somewhat inefficient in some cases,
though --- currently __filemap_fdatawrite simply list_splice()s the
inode dirty list into the io list. Walking a long dirty list to flush
just a few pages from a narrow range could get slow, and walking the
radix tree would be inefficient if there are only a few dirty pages
hidden in a large cache of clean pages.
> My argument was that a standard CANNOT say anything one way or the other,
> because the behaviour IS NOT USER-VISIBLE!
Worse, it doesn't seem to be implemented consistently either. I've been
trying on a few other Unixen while writing this. First on a Tru64 box,
and it is _not_ kicking off any IO at all for MS_ASYNC, except for the
30-second regular sync. The same appears to be true on FreeBSD. And on
HP-UX, things go in the other direction: the performance of MS_ASYNC is
identical to MS_SYNC, both in terms of observed disk IO during the sync
and the overall rate of the msync loop.
So it appears we've got Unix precedent for pretty-much any reasonable
interpretation of MS_ASYNC that we want. Patch withdrawn!
--Stephen
On Thu, 1 Apr 2004, Stephen C. Tweedie wrote:
>
> So it appears that on Solaris, MS_ASYNC is kicking off instant IO, but
> is not waiting for existing IO to complete first.
A much more likely scenario is that Solaris is really doing the same
thing we are, but it _also_ ends up opportunistically trying to put the
resultant pages on the IO queues if possible (ie do a "write-ahead": start
writeout if that doesn't imply blocking).
We could probably do that too, it seems easy enough. A
"TestSetPageLocked()" along with setting the BIO_RW_AHEAD flag. The only
problem is that I don't think we really have support for doing write-ahead
(ie we clear the page "dirty" bit too early, so if the write gets
cancelled due to the IO queues being full, the dirty bit gets lost).
So we don't want to go there for now, but it's something to keep in mind,
perhaps.
> Worse, it doesn't seem to be implemented consistently either. I've been
> trying on a few other Unixen while writing this. First on a Tru64 box,
> and it is _not_ kicking off any IO at all for MS_ASYNC, except for the
> 30-second regular sync. The same appears to be true on FreeBSD. And on
> HP-UX, things go in the other direction: the performance of MS_ASYNC is
> identical to MS_SYNC, both in terms of observed disk IO during the sync
> and the overall rate of the msync loop.
If you check HP-UX, make sure it's a recent one. HPUX has historically
been just too broken for words when it comes to mmap() (ie some _really_
strange semantics, like not being able to unmap partial mappings etc).
Linus
Stephen C. Tweedie wrote:
> Yes, but we _used_ to have that choice --- call msync() with flags == 0,
> and you'd get the deferred kupdated writeback;
Is that not equivalent to MS_INVALIDATE? It seems to be equivalent in
2.6.4.
The code in 2.6.4 ignores MS_INVALIDATE except for trivial error
checks, so msync() with flags == MS_INVALIDATE has the same effect as
msync() with flags == 0.
Some documentation I'm looking at says MS_INVALIDATE updates the
mapped page to contain the current contents of the file. 2.6.4 seems
to do the reverse: update the file to contain the current content of
the mapped page. "man msync" agrees with the latter. (I can't
look at SUS right now).
On systems where the CPU caches are fully coherent, the only
difference is that the former is a no-op and the latter does the same
as the new behaviour of MS_ASYNC.
On systems where the CPU caches aren't coherent, some cache
synchronising or flushing operations are implied.
On either type of system, MS_INVALIDATE doesn't seem to be doing what
the documentation I'm looking at says it should do.
-- Jamie
Hi,
On Thu, 2004-04-01 at 17:02, Linus Torvalds wrote:
> > Worse, it doesn't seem to be implemented consistently either. I've been
> > trying on a few other Unixen while writing this. First on a Tru64 box,
> > and it is _not_ kicking off any IO at all for MS_ASYNC, except for the
> > 30-second regular sync. The same appears to be true on FreeBSD. And on
> > HP-UX, things go in the other direction: the performance of MS_ASYNC is
> > identical to MS_SYNC, both in terms of observed disk IO during the sync
> > and the overall rate of the msync loop.
>
> If you check HP-UX, make sure it's a recent one. HPUX has historically
> been just too broken for words when it comes to mmap() (ie some _really_
> strange semantics, like not being able to unmap partial mappings etc).
I'm not sure what counts as "recent" for that, but this was on HP-UX
11. That's the most recent I've got access to.
--Stephen
Hi,
On Thu, 2004-04-01 at 17:19, Jamie Lokier wrote:
> Some documentation I'm looking at says MS_INVALIDATE updates the
> mapped page to contain the current contents of the file. 2.6.4 seems
> to do the reverse: update the file to contain the current content of
> the mapped page. "man msync" agrees with the the latter. (I can't
> look at SUS right now).
btw, just looking at the filemap_sync_pte() code for MS_INVALIDATE, I
noticed
if (!PageReserved(page) &&
(ptep_clear_flush_dirty(vma, address, ptep) ||
page_test_and_clear_dirty(page)))
set_page_dirty(page);
I just happened to follow the function and noticed that on s390,
page_test_and_clear_dirty() has the comment:
* Test and clear dirty bit in storage key.
* We can't clear the changed bit atomically. This is a potential
* race against modification of the referenced bit. This function
* should therefore only be called if it is not mapped in any
* address space.
but in this case the page is clearly mapped in the caller's address
space, else we wouldn't have reached this.
Is this a problem?
--Stephen
Hi,
On Thu, 2004-04-01 at 17:19, Jamie Lokier wrote:
> Stephen C. Tweedie wrote:
> > Yes, but we _used_ to have that choice --- call msync() with flags == 0,
> > and you'd get the deferred kupdated writeback;
>
> Is that not equivalent to MS_INVALIDATE? It seems to be equivalent in
> 2.6.4.
It is in all the kernels I've looked at, but that's mainly because we
seem to ignore MS_INVALIDATE.
> Some documentation I'm looking at says MS_INVALIDATE updates the
> mapped page to contain the current contents of the file. 2.6.4 seems
> to do the reverse: update the file to contain the current content of
> the mapped page. "man msync" agrees with the latter. (I can't
> look at SUS right now).
SUSv3 says
When MS_INVALIDATE is specified, msync() shall invalidate all
cached copies of mapped data that are inconsistent with the
permanent storage locations such that subsequent references
shall obtain data that was consistent with the permanent storage
locations sometime between the call to msync() and the first
subsequent memory reference to the data.
which seems to imply that dirty ptes should simply be cleared, rather
than propagated to the page dirty bits.
That's easy enough --- we already propagate the flags down to
filemap_sync_pte, where the page and pte dirty bits are modified. Does
anyone know any reason why we don't do MS_INVALIDATE there already?
--Stephen
"Stephen C. Tweedie" <[email protected]> wrote:
>
> > The advantage of the current MS_ASYNC is absolutely astoundingly HUGE:
> > because we don't wait for in-progress IO, it can be used to efficiently
> > synchronize multiple different areas, and then after that waiting for them
> > with _one_ single fsync().
>
> The Solaris one manages to preserve those properties while still
> scheduling the IO "soon". I'm not sure how we could do that in the
> current VFS, short of having a background thread scheduling deferred
> writepage()s as soon as the existing page becomes unlocked.
filemap_flush() will do exactly this. So if you want the Solaris
semantics, calling filemap_flush() instead of filemap_fdatawrite() should do
it.
> posix_fadvise() seems to do something a little like this already: the
> FADV_DONTNEED handler tries
>
> if (!bdi_write_congested(mapping->backing_dev_info))
> filemap_flush(mapping);
>
> before going into the invalidate_mapping_pages() call. Having that (a)
> limited to the specific file range passed into the fadvise(), and (b)
> available as a separate function independent of the DONTNEED page
> invalidator, would seem like an entirely sensible extension.
>
> The obvious implementations would be somewhat inefficient in some cases,
> though --- currently __filemap_fdatawrite simply list_splice()s the
> inode dirty list into the io list. Walking a long dirty list to flush
> just a few pages from a narrow range could get slow, and walking the
> radix tree would be inefficient if there are only a few dirty pages
> hidden in a large cache of clean pages.
The patches I have queued in -mm allow us to do this. We use
find_get_pages_tag() to iterate over only the dirty pages in the tree.
That still has the efficiency problem that when searching for dirty pages
we also visit pages which are both dirty and under writeback (we're not
interested in those pages if it is a non-blocking flush), although I've
only observed that to be a problem when the queue size was bumped up to
10,000 requests and I fixed that up for the common cases by other means.
Stephen C. Tweedie wrote:
> I've been looking at a discrepancy between msync() behaviour on 2.4.9
> and newer 2.4 kernels, and it looks like things changed again in
> 2.5.68.
When you say a discrepancy between 2.4.9 and newer 2.4 kernels, do you
mean that the msync() behaviour changed during the 2.4 series?
If so, what was the change?
Thanks,
-- Jamie
Hi,
On Fri, 2004-04-16 at 23:35, Jamie Lokier wrote:
> Stephen C. Tweedie wrote:
> > I've been looking at a discrepancy between msync() behaviour on 2.4.9
> > and newer 2.4 kernels, and it looks like things changed again in
> > 2.5.68.
>
> When you say a discrepancy between 2.4.9 and newer 2.4 kernels, do you
> mean that the msync() behaviour changed during the 2.4 series?
Yes.
> If so, what was the change?
2.4.9 behaved like current 2.6 --- on MS_ASYNC, it did a
set_page_dirty() which means the page will get picked up by the next
5-second bdflush pass. But later 2.4 kernels were changed so that they
started MS_ASYNC IO immediately with filemap_fdatasync() (which is
asynchronous regarding the new IO, but which blocks synchronously if
there is already old IO in flight on the page.)
That was reverted back to the earlier, 2.4.9 behaviour in the 2.5
series.
Cheers,
Stephen
Stephen C. Tweedie wrote:
> > If so, what was the change?
>
> 2.4.9 behaved like current 2.6 --- on MS_ASYNC, it did a
> set_page_dirty() which means the page will get picked up by the next
> 5-second bdflush pass. But later 2.4 kernels were changed so that they
> started MS_ASYNC IO immediately with filemap_fdatasync() (which is
> asynchronous regarding the new IO, but which blocks synchronously if
> there is already old IO in flight on the page.)
>
> That was reverted back to the earlier, 2.4.9 behaviour in the 2.5
> series.
It was 2.5.68.
Thanks, that's very helpful.
msync(0) has always had behaviour consistent with the <=2.4.9 and
>=2.5.68 MS_ASYNC behaviour, is that right?
If so, programs may as well "#define MS_ASYNC 0" on Linux, to get well
defined and consistent behaviour. It would be nice to change the
definition in libc to zero, but I don't think it's possible because
msync(MS_SYNC|MS_ASYNC) needs to fail.
-- Jamie
Hi,
On Wed, 2004-04-21 at 03:10, Jamie Lokier wrote:
> msync(0) has always had behaviour consistent with the <=2.4.9 and
> >=2.5.68 MS_ASYNC behaviour, is that right?
Not sure about "always", but it looks like it recently at least. 2.2
msync was implemented very differently but seems, from the source, to
have the same property --- do_write_page() calls f_op->write() on msync,
and MS_SYNC forces an fsync after the writes. But 2.4 and 2.6 share
much more similar code to each other. So all since 2.2 seem to do the
fully-async, deferred writeback behaviour for flags==0.
--Stephen
Sorry to bring this up only after a 2-year hiatus, but I'm trying to
port an application from Solaris and Linux 2.4 to 2.6 and finding amazing
performance regression due to this. (For reference, as of 2.5.something,
msync(MS_ASYNC) just puts the pages on a dirty list but doesn't actually
start the I/O until some fairly coarse timer in the VM system fires.)
It uses msync(MS_ASYNC) and msync(MS_SYNC) as a poor man's portable
async IO. It's appending data to an on-disk log. When a page is full
or a transaction complete, the page will not be modified any further and
it uses MS_ASYNC to start the I/O as early as possible. (When compiled
with debugging, it also remaps the page read-only.)
Then it accomplishes as much as it can without needing the transaction
committed (typically 25-100 ms), and when it's blocked until the
transaction is known to be durable, it calls msync(MS_SYNC). Which
should, if everything went right, return immediately, because the page
is clean.
Reading the spec, this seems like exactly what msync() is designed for.
But looking at the 2.6 code, I see that it doesn't actually start the
write promptly, and makes the kooky and unusable suggestions to either
use fdatasync(fd), which will block on all the *following* transactions,
or fadvise(FADV_DONTNEED), which is emphatically lying to the kernel.
The data *is* needed in future (by the database *readers*), so discarding
it from memory is a stupid idea. The only thing we don't need to do to
the page any more is write to it.
Now, I know I could research when async-IO support became reasonably
stable and have the code do async I/O when on a recent enough kernel,
but I really wonder who the genius was who managed to misunderstand
"initiated or queued for servicing" enough to think it involves sleeping
with an idle disk.
Yes, it's just timing, but I hadn't noticed Linux developers being
ivory-tower academic types who only care about correctness or big-O
performance measures. In fact, "no, we can't prove it's starvation-free,
but damn it's fast on benchmarks!" is more of the attitude I've come
to expect.
Anyway, for future reference, Linux's current non-implementation of
msync(MS_ASYNC) is an outright BUG. It "computes the correct result",
but totally buggers performance.
(Deep breath) Now that I've finished complaining, I need to ask for
help. I may be inspired to fix the kernel, but first I have to fix my
application, which has to run on existing Linux kernels.
Can anyone advise me on the best way to perform this sort of split-
transaction disk write on extant 2.6 kernels? Preferably without using
glibc's pthread-based aio emulation? Will O_DIRECT and !O_SYNC writes
do what I want?  Or will that interact badly with mmap()?
For my application, all transactions are completed in-order, so there is
never any question of which order to wait for completion. My current
guess is that I'm going to have to call io_submit directly; is there
any documentation with more detail than the man pages but less than the
source code?  The former is silent on the semantics of the various
IOCB_CMD_* opcodes, while the latter doesn't distinguish clearly between
the promises the interface is intended to keep and the properties of
the current implementation.
Thanks for any suggestions.
[email protected] wrote:
>
> Sorry to bring this up only after a 2-year hiatus, but I'm trying to
> port an application from Solaris and Linux 2.4 to 2.6 and finding amazing
> performance regression due to this.  (For reference, as of 2.5.something,
> msync(MS_ASYNC) just puts the pages on a dirty list but doesn't actually
> start the I/O until some fairly coarse timer in the VM system fires.)
>
> It uses msync(MS_ASYNC) and msync(MS_SYNC) as a poor man's portable
> async IO. It's appending data to an on-disk log. When a page is full
> or a transaction complete, the page will not be modified any further and
> it uses MS_ASYNC to start the I/O as early as possible. (When compiled
> with debugging, it also uses remaps the page read-only.)
>
> Then it accomplishes as much as it can without needing the transaction
> committed (typically 25-100 ms), and when it's blocked until the
> transaction is known to be durable, it calls msync(MS_SYNC). Which
> should, if everything went right, return immediately, because the page
> is clean.
2.4:
MS_ASYNC: dirty the pagecache pages, start I/O
MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O
2.6:
MS_ASYNC: dirty the pagecache pages
MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O.
So you're saying that doing the I/O in that 25-100msec window allowed your
app to do more pipelining.
I think for most scenarios, what we have in 2.6 is better: it gives the app
more control over when the I/O should be started. But not for you, because
you have this handy 25-100ms window in which to do other stuff, which
eliminates the need to create a new thread to do the I/O.
Something like this? (Needs a triple-check).
Add two new linux-specific fadvise extensions():
LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
offsets `offset' and `offset+len'.
LINUX_FADV_SYNC_WRITE: start and wait upon writeout of any dirty pages between
file offsets `offset' and `offset+len'.
The patch also regularises the filemap_write_and_wait_range() API. Make it
look like the __filemap_fdatawrite_range() one: the `end' argument points at
the first byte beyond the range being written.
Signed-off-by: Andrew Morton <[email protected]>
---
fs/direct-io.c | 2 +-
include/linux/fadvise.h | 6 ++++++
include/linux/fs.h | 3 +++
mm/fadvise.c | 13 +++++++++++--
mm/filemap.c | 18 +++++++++++-------
5 files changed, 32 insertions(+), 10 deletions(-)
diff -puN mm/fadvise.c~fadvise-async-write-commands mm/fadvise.c
--- devel/mm/fadvise.c~fadvise-async-write-commands 2006-02-08 23:55:42.000000000 -0800
+++ devel-akpm/mm/fadvise.c 2006-02-09 00:16:58.000000000 -0800
@@ -15,6 +15,7 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/fadvise.h>
+#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <asm/unistd.h>
@@ -96,11 +97,19 @@ asmlinkage long sys_fadvise64_64(int fd,
filemap_flush(mapping);
/* First and last FULL page! */
- start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
if (end_index > start_index)
- invalidate_mapping_pages(mapping, start_index, end_index-1);
+ invalidate_mapping_pages(mapping, start_index,
+ end_index - 1);
+ break;
+ case LINUX_FADV_ASYNC_WRITE:
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
+ break;
+ case LINUX_FADV_SYNC_WRITE:
+ ret = filemap_write_and_wait_range(mapping, offset, endbyte);
break;
default:
ret = -EINVAL;
diff -puN include/linux/fadvise.h~fadvise-async-write-commands include/linux/fadvise.h
--- devel/include/linux/fadvise.h~fadvise-async-write-commands 2006-02-08 23:55:42.000000000 -0800
+++ devel-akpm/include/linux/fadvise.h 2006-02-08 23:56:55.000000000 -0800
@@ -18,4 +18,10 @@
#define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */
#endif
+/*
+ * Linux-specific fadvise() extensions:
+ */
+#define LINUX_FADV_ASYNC_WRITE 32 /* Start writeout on range */
+#define LINUX_FADV_SYNC_WRITE 33 /* Write out and wait upon range */
+
#endif /* FADVISE_H_INCLUDED */
diff -puN mm/filemap.c~fadvise-async-write-commands mm/filemap.c
--- devel/mm/filemap.c~fadvise-async-write-commands 2006-02-08 23:59:01.000000000 -0800
+++ devel-akpm/mm/filemap.c 2006-02-09 00:10:40.000000000 -0800
@@ -174,7 +174,8 @@ static int sync_page(void *word)
* dirty pages that lie within the byte offsets <start, end>
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
- * @end: offset in bytes where the range ends
+ * @end: offset in bytes where the range ends (+1: we write end-start
+ * bytes)
* @sync_mode: enable synchronous operation
*
* If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
@@ -182,8 +183,8 @@ static int sync_page(void *word)
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
*/
-static int __filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end, int sync_mode)
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end, int sync_mode)
{
int ret;
struct writeback_control wbc = {
@@ -212,8 +213,8 @@ int filemap_fdatawrite(struct address_sp
}
EXPORT_SYMBOL(filemap_fdatawrite);
-static int filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end)
+static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
@@ -367,19 +368,22 @@ int filemap_write_and_wait(struct addres
}
EXPORT_SYMBOL(filemap_write_and_wait);
+/*
+ * Write out and wait upon all the bytes between lstart and (lend-1)
+ */
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
int err = 0;
- if (mapping->nrpages) {
+ if (mapping->nrpages && lend > lstart) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
if (err != -EIO) {
int err2 = wait_on_page_writeback_range(mapping,
lstart >> PAGE_CACHE_SHIFT,
- lend >> PAGE_CACHE_SHIFT);
+ (lend - 1) >> PAGE_CACHE_SHIFT);
if (!err)
err = err2;
}
diff -puN include/linux/fs.h~fadvise-async-write-commands include/linux/fs.h
--- devel/include/linux/fs.h~fadvise-async-write-commands 2006-02-08 23:59:24.000000000 -0800
+++ devel-akpm/include/linux/fs.h 2006-02-09 00:03:22.000000000 -0800
@@ -1476,6 +1476,9 @@ extern int filemap_fdatawait(struct addr
extern int filemap_write_and_wait(struct address_space *mapping);
extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
+extern int __filemap_fdatawrite_range(struct address_space *mapping,
+ loff_t start, loff_t end, int sync_mode);
+
extern void sync_supers(void);
extern void sync_filesystems(int wait);
extern void emergency_sync(void);
diff -puN fs/direct-io.c~fadvise-async-write-commands fs/direct-io.c
--- devel/fs/direct-io.c~fadvise-async-write-commands 2006-02-09 00:09:54.000000000 -0800
+++ devel-akpm/fs/direct-io.c 2006-02-09 00:10:06.000000000 -0800
@@ -1240,7 +1240,7 @@ __blockdev_direct_IO(int rw, struct kioc
}
retval = filemap_write_and_wait_range(mapping, offset,
- end - 1);
+ end);
if (retval) {
kfree(dio);
goto out;
_
Andrew Morton wrote:
>
> 2.4:
>
> MS_ASYNC: dirty the pagecache pages, start I/O
> MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O
>
> 2.6:
>
> MS_ASYNC: dirty the pagecache pages
> MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O.
>
> So you're saying that doing the I/O in that 25-100msec window allowed your
> app to do more pipelining.
>
> I think for most scenarios, what we have in 2.6 is better: it gives the app
> more control over when the I/O should be started.
How so?
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> Andrew Morton wrote:
>
> >
> > 2.4:
> >
> > MS_ASYNC: dirty the pagecache pages, start I/O
> > MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O
> >
> > 2.6:
> >
> > MS_ASYNC: dirty the pagecache pages
> > MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O.
> >
> > So you're saying that doing the I/O in that 25-100msec window allowed your
> > app to do more pipelining.
> >
> > I think for most scenarios, what we have in 2.6 is better: it gives the app
> > more control over when the I/O should be started.
>
> How so?
>
Well, for example you might want to msync a number of disjoint parts of the
mapping, then write them all out in one hit.
Or you may not actually _want_ to start the I/O now - you just want pdflush
to write things back in a reasonable time period, so you don't have unsynced
data floating about in memory for eight hours. That's a quite reasonable
application of msync(MS_ASYNC).
> So you're saying that doing the I/O in that 25-100msec window allowed your
> app to do more pipelining.
Specifically, it allowed it to never block and have fast response times.
That's just the nature of two-phase commit: there's a period during
which your application doesn't know if the commit is durable or not.
But once your code supports having that time window, you can do a full
sliding window and avoid blocking on the completion of a transaction
that's not a prerequisite to the current one.
> I think for most scenarios, what we have in 2.6 is better: it gives the app
> more control over when the I/O should be started. But not for you, because
> you have this handy 25-100ms window in which to do other stuff, which
> eliminates the need to create a new thread to do the I/O.
Er... I fail to see how "push the dirty bit from internal level 1 to
internal level 2" gives the app any control. If the system is paging at
all, the page replacement algorithm will eventially notice a dirty page
that isn't being dirtied any more and clean it. So the 2.6 behaviour
changes one unknown and indefinite timeout to another unknown and
indefinite timeout.  "Start (or, if the disk is busy, queue) the I/O"
means it will be written out ASAP, basically as fast as a synchronous
write would do it, but without blocking. That's somewhat definite.
(I say "basically" because a scheduler that gives priority to synchronous
I/O is not unreasonable. But any delay should be due to the disk being
busy getting useful I/O done.)
I don't quite understand your point about the thread. Yes, the work to do
is not strictly serialized, so some of it can be started before knowing
the result of the most recently committed transaction. That's why I
want to do a split-transaction write: start writing at t1, do everything
that does not depend on the write, then wait for completion from t2..t3.
The idea is that an adequate t2-t1 will result in a very short t3-t2,
because the I/O latency is t3-t1.
It's the existence of msync(MS_ASYNC) that eliminates the need to create
a new thread to do the I/O, not the nature of the work. It's the nature
of the work that provides the opportunity to take advantage of overlapped
I/O, which wants a thread or some other form of synchronous I/O.
> Something like this? (Needs a triple-check).
Um, yes, thanks for the patch, except that I happen to think they should
be called msync(buf, len, MS_ASYNC) and msync(buf, len, MS_SYNC).
The current, not terribly useful, behaviour is adequately covered by
msync(buf, len, 0). That can be documented as "propagate the dirty
bits from the process' virtual address space to the file system buffer,
where it will be treated just like write(2) data: it will be written out
by the usual timer or can be written out by functions such as fsync().
Without using msync(), an fsync() call is not guaranteed to notice the
change." I don't know if there's an implicit msync(buf, len, 0) when
an address space is destroyed, but that would be good to document, too.
And, while I certainly don't mean to discourage kernel improvements,
my immediate problem is to find a solution (a "workaround", at least)
that works on 2.6.x, where x <= 15.
I thought with msync(), I had found something that was both efficient
and portable. Wishful thinking, it seems...
Anyway, thanks for the response!
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>
>>Andrew Morton wrote:
>>
>>
>>>2.4:
>>>
>>> MS_ASYNC: dirty the pagecache pages, start I/O
>>> MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O
>>>
>>>2.6:
>>>
>>> MS_ASYNC: dirty the pagecache pages
>>> MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O.
>>>
>>>So you're saying that doing the I/O in that 25-100msec window allowed your
>>>app to do more pipelining.
>>>
>>>I think for most scenarios, what we have in 2.6 is better: it gives the app
>>>more control over when the I/O should be started.
>>
>>How so?
>>
>
>
> Well, for example you might want to msync a number of disjoint parts of the
> mapping, then write them all out in one hit.
>
That should still be pretty efficient with 2.4 like behaviour? pdflush
does write them out in file offset order doesn't it?
> Or you may not actually _want_ to start the I/O now - you just want pdflush
> to write things back in a reasonable time period, so you don't have unsynced
> data floating about in memory for eight hours. That's a quite reasonable
> application of msync(MS_ASYNC).
>
I think data integrity requirements should be handled by MS_SYNC.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>
>>Andrew Morton wrote:
>>
>>
>>>2.4:
>>>
>>> MS_ASYNC: dirty the pagecache pages, start I/O
>>> MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O
>>>
>>>2.6:
>>>
>>> MS_ASYNC: dirty the pagecache pages
>>> MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O.
>>>
>>>So you're saying that doing the I/O in that 25-100msec window allowed your
>>>app to do more pipelining.
>>>
>>>I think for most scenarios, what we have in 2.6 is better: it gives the app
>>>more control over when the I/O should be started.
>>
>>How so?
>>
>
>
> Well, for example you might want to msync a number of disjoint parts of the
> mapping, then write them all out in one hit.
>
That should still be pretty efficient with 2.4 like behaviour? pdflush
does write them out in file offset order doesn't it?
> Or you may not actually _want_ to start the I/O now - you just want pdflush
> to write things back in a reasonable time period, so you don't have unsynced
> data floating about in memory for eight hours. That's a quite reasonable
> application of msync(MS_ASYNC).
>
I think data integrity requirements should be handled by MS_SYNC.
What the app does lose some control of is when IO actually should get started,
(MS_SYNC still allows it to control when IO *finishes*).
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> Andrew Morton wrote:
> > Nick Piggin <[email protected]> wrote:
> >
> >>Andrew Morton wrote:
> >>
> >>
> >>>2.4:
> >>>
> >>> MS_ASYNC: dirty the pagecache pages, start I/O
> >>> MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O
> >>>
> >>>2.6:
> >>>
> >>> MS_ASYNC: dirty the pagecache pages
> >>> MS_SYNC: dirty the pagecache pages, start I/O, wait on I/O.
> >>>
> >>>So you're saying that doing the I/O in that 25-100msec window allowed your
> >>>app to do more pipelining.
> >>>
> >>>I think for most scenarios, what we have in 2.6 is better: it gives the app
> >>>more control over when the I/O should be started.
> >>
> >>How so?
> >>
> >
> >
> > Well, for example you might want to msync a number of disjoint parts of the
> > mapping, then write them all out in one hit.
> >
>
> That should still be pretty efficient with 2.4 like behaviour?
It's a bit of a disaster if you happen to msync(MS_ASYNC) the same page at
any sort of frequency - we have to wait for the previous I/O to complete
before new I/O can be started. That was the main problem which caused this
change to be made. You can see that it'd make 100x or 1000x speed improvements
with some sane access patterns.
> pdflush
> does write them out in file offset order doesn't it?
pdflush does, but an msync(MS_ASYNC) which starts I/O puts the IO order
into the application's control.
> > Or you may not actually _want_ to start the I/O now - you just want pdflush
> > to write things back in a reasonable time period, so you don't have unsynced
> > data floating about in memory for eight hours. That's a quite reasonable
> > application of msync(MS_ASYNC).
> >
>
> I think data integrity requirements should be handled by MS_SYNC.
Well that's always been the case. MS_ASYNC doesn't write metadata.
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>
>>Andrew Morton wrote:
>>
>>>
>>>Well, for example you might want to msync a number of disjoint parts of the
>>>mapping, then write them all out in one hit.
>>>
>>
>>That should still be pretty efficient with 2.4 like behaviour?
>
>
> It's a bit of a disaster if you happen to msync(MS_ASYNC) the same page at
> any sort of frequency - we have to wait for the previous I/O to complete
> before new I/O can be started. That was the main problem which caused this
> change to be made. You can see that it'd make 100x or 1000x speed improvements
> with some sane access patterns.
>
I'm not sure you'd have to do that, would you? Just move the dirty bit
from the pte and skip the page if it is found locked or writeback.
>
>>pdflush
>>does write them out in file offset order doesn't it?
>
>
> pdflush does, but an msync(MS_ASYNC) which starts I/O puts the IO order
> into the application's control.
>
I don't see a problem with that. There are plenty of ways to shoot oneself
in the foot.
>
>>>Or you may not actually _want_ to start the I/O now - you just want pdflush
>>>to write things back in a reasonable time period, so you don't have unsynced
>>>data floating about in memory for eight hours. That's a quite reasonable
>>>application of msync(MS_ASYNC).
>>>
>>
>>I think data integrity requirements should be handled by MS_SYNC.
>
>
> Well that's always been the case. MS_ASYNC doesn't write metadata.
>
>
So I don't understand your argument for using MS_ASYNC in that case.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> > It's a bit of a disaster if you happen to msync(MS_ASYNC) the same page at
> > any sort of frequency - we have to wait for the previous I/O to complete
> > before new I/O can be started. That was the main problem which caused this
> > change to be made. You can see that it'd make 100x or 1000x speed improvements
> > with some sane access patterns.
> >
>
> I'm not sure you'd have to do that, would you? Just move the dirty bit
> from the pte and skip the page if it is found locked or writeback.
That would make MS_ASYNC mean "start I/O now, unless there's I/O in
progress, in which case start I/O in 30 seconds".  That's not good.
If we're going to change the kernel, better off using fadvise()
enhancements, which are also useful for post-write() operations.
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>
>> > It's a bit of a disaster if you happen to msync(MS_ASYNC) the same page at
>> > any sort of frequency - we have to wait for the previous I/O to complete
>> > before new I/O can be started. That was the main problem which caused this
>> > change to be made. You can see that it'd make 100x or 1000x speed improvements
>> > with some sane access patterns.
>> >
>>
>> I'm not sure you'd have to do that, would you? Just move the dirty bit
>> from the pte and skip the page if it is found locked or writeback.
>
>
> That would make MS_ASYNC mean "start I/O now, unless there's I/O in
> progress, in which case start I/O in 30 seconds".  That's not good.
>
Yes, that change would make MS_ASYNC asynchronously start as much
IO as possible, as soon as possible. Which is good for the problem
reporter, who uses it to pipeline IOs and seems to have fairly good
control of when IO starts and finishes.
I don't think anyone would use MS_ASYNC for anything other than
performance improvement, so it is not like we need super well
defined behaviour... the earlier it will start IO AFAIKS the better.
> If we're going to change the kernel, better off using fadvise()
> enhancements, which are also useful for post-write() operations.
>
I don't think there is any downside to changing MS_ASYNC either,
though.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> I don't think anyone would use MS_ASYNC for anything other than
> performance improvement, so it is not like we need super well
> defined behaviour... the earlier it will start IO AFAIKS the better.
Well, no. Consider a continuously-running application which modifies its
data store via MAP_SHARED+msync(MS_ASYNC). If the msync() immediately
started I/O, the disk would be seeking all over the place all the time. The
queue merging and timer-based unplugging would help here, but it won't be
as good as a big, infrequent ascending-file-offset pdflush pass.
Secondly, consider the behaviour of the above application if it is modifying
the same page relatively frequently (quite likely). If MS_ASYNC starts I/O
immediately, that page will get written 10, 100 or 1000 times per second.
If MS_ASYNC leaves it to pdflush, that page gets written once per 30
seconds, so we do far much less I/O.
We just don't know. It's better to leave it up to the application designer
rather than lumping too many operations into the one syscall.
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>
>> I don't think anyone would use MS_ASYNC for anything other than
>> performance improvement, so it is not like we need super well
>> defined behaviour... the earlier it will start IO AFAIKS the better.
>
>
> Well, no. Consider a continuously-running application which modifies its
> data store via MAP_SHARED+msync(MS_ASYNC). If the msync() immediately
> started I/O, the disk would be seeking all over the place all the time. The
> queue merging and timer-based unplugging would help here, but it won't be
> as good as a big, infrequent ascending-file-offset pdflush pass.
>
Sure you can shoot yourself in the foot.
"msync flushes changes made to the in-core copy of a file that
was mapped into memory using mmap(2) back to disk. "
We usually don't cater to foot shooters at the expense of valid users.
AFAIKS, basically the only valid use for MS_ASYNC is for the app to tell
the kernel that it isn't going to write here for a long time, so writeback
may as well be scheduled; or to pipeline other work with an upcoming data
integrity point which will need to be guaranteed by a second call to MS_SYNC.
> Secondly, consider the behaviour of the above application if it is modifying
> the same page relatively frequently (quite likely). If MS_ASYNC starts I/O
> immediately, that page will get written 10, 100 or 1000 times per second.
> If MS_ASYNC leaves it to pdflush, that page gets written once per 30
> seconds, so we do far much less I/O.
>
> We just don't know. It's better to leave it up to the application designer
> rather than lumping too many operations into the one syscall.
Well it remains the same conceptual operation (asynchronously "schedule"
dirty pages for writeout). However it simply becomes more useful to start
the writeout immediately, given that's the (pretty explicit) hint that is
given to us.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> > Secondly, consider the behaviour of the above application if it is modifying
> > the same page relatively frequently (quite likely). If MS_ASYNC starts I/O
> > immediately, that page will get written 10, 100 or 1000 times per second.
> > If MS_ASYNC leaves it to pdflush, that page gets written once per 30
> > seconds, so we do far much less I/O.
> >
> > We just don't know. It's better to leave it up to the application designer
> > rather than lumping too many operations into the one syscall.
>
> Well it remains the same conceptual operation (asynchronously "schedule"
> dirty pages for writeout). However it simply becomes more useful to start
> the writeout immediately, given that's the (pretty explicit) hint that is
> given to us.
If you want to start the I/O now, fine, start the I/O now.
If you don't want to start I/O now, fine, don't start I/O now.
If msync() were to unconditionally start I/O, you don't get that option.
It's pretty simple, isn't it?
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>
>>>Secondly, consider the behaviour of the above application if it is modifying
>>
>> > the same page relatively frequently (quite likely). If MS_ASYNC starts I/O
>> > immediately, that page will get written 10, 100 or 1000 times per second.
>> > If MS_ASYNC leaves it to pdflush, that page gets written once per 30
>> > seconds, so we do far much less I/O.
>> >
>> > We just don't know. It's better to leave it up to the application designer
>> > rather than lumping too many operations into the one syscall.
>>
>> Well it remains the same conceptual operation (asynchronously "schedule"
>> dirty pages for writeout). However it simply becomes more useful to start
>> the writeout immediately, given that's the (pretty explicit) hint that is
>> given to us.
>
>
> If you want to start the I/O now, fine, start the I/O now.
>
> If you don't want to start I/O now, fine, don't start I/O now.
>
> If msync() were to unconditionally start I/O, you don't get that option.
>
Huh? Sure you do.
If you want to start the IO *now* without waiting on it, call msync(MS_ASYNC)
If you don't want to start the IO now, that's really easy, do nothing.
If you want to start the IO now and also wait for it to finish, call msync(MS_SYNC)
Presently, the first option is unavailable.
> It's pretty simple, isn't it?
>
Yes.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> If you want to start the IO *now* without waiting on it, call msync(MS_ASYNC)
> If you don't want to start the IO now, that's really easy, do nothing.
> If you want to start the IO now and also wait for it to finish, call msync(MS_SYNC)
I've already explained the problems with the start-io-in-MS_ASYNC approach.
> Presently, the first option is unavailable.
We need to patch the kernel either way. There's no point in going back to
either the known-problematic approach or to something half-assed.
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>
>>If you want to start the IO *now* without waiting on it, call msync(MS_ASYNC)
>> If you don't want to start the IO now, that's really easy, do nothing.
>> If you want to start the IO now and also wait for it to finish, call msync(MS_SYNC)
>
>
> I've already explained the problems with the start-io-in-MS_ASYNC approach.
>
But I've explained that they only matter for people using it in stupid ways.
fsync also poses a performance problem for programs that call it after every
write(2).
>
>> Presently, the first option is unavailable.
>
>
> We need to patch the kernel either way. There's no point in going back to
> either the known-problematic approach or to something half-assed.
>
The system call indicates to the kernel that IO submission should be started.
The earlier the kernel does that, the better (because it is likely that an
MS_SYNC is coming soon).
I think the current way of just moving the dirty bits is half-assed.
Is a more efficient implementation known-problematic? What applications did
you observe problems with, can you remember? Because the current behaviour
is also known-problematic for [email protected] (who are you anyway?)
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> Andrew Morton wrote:
> > Nick Piggin <[email protected]> wrote:
> >
> >>If you want to start the IO *now* without waiting on it, call msync(MS_ASYNC)
> >> If you don't want to start the IO now, that's really easy, do nothing.
> >> If you want to start the IO now and also wait for it to finish, call msync(MS_SYNC)
> >
> >
> > I've already explained the problems with the start-io-in-MS_ASYNC approach.
> >
>
> But I've explained that they only matter for people using it in stupid ways.
> fsync also poses a performance problem for programs that call it after every
> write(2).
There's absolutely nothing stupid about
*p = <expr>
msync(p, sizeof(*p), MS_ASYNC);
> >
> >> Presently, the first option is unavailable.
> >
> >
> > We need to patch the kernel either way. There's no point in going back to
> > either the known-problematic approach or to something half-assed.
> >
>
> The system call indicates to the kernel that IO submission should be started.
> The earlier the kernel does that, the better (because it is likely that an
> MS_SYNC is coming soon).
>
> I think the current way of just moving the dirty bits is half-assed.
>
> Is a more efficient implementation known-problematic?
It's less efficient for some things. A lot.
> What applications did
> you observe problems with, can you remember?
Linus has some application which was doing the above. It ran extremely
slowly, so we changed MS_ASYNC (ie: made it "more efficient"...)
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>>But I've explained that they only matter for people using it in stupid ways.
>>fsync also poses a performance problem for programs that call it after every
>>write(2).
>
>
> There's absolutely nothing stupid about
>
> *p = <expr>
> msync(p, sizeof(*p), MS_ASYNC);
>
There really is if you're expecting a short time later to do
*p = <expr2>
and had no need for a MS_SYNC anywhere in the meantime.
If you did have the need for MS_SYNC, then kicking off the IO
ASAP is going to be more efficient.
>>
>>Is a more efficient implementation known-problematic?
>
>
> It's less efficient for some things. A lot.
>
But only for stupid things, right?
>
>>What applications did
>>you observe problems with, can you remember?
>
>
> Linus has some application which was doing the above. It ran extremely
> slowly, so we changed MS_ASYNC (ie: made it "more efficient"...)
Can he remember what it is? It sounds like it is broken.
OTOH, it could have been blocking on pages already under writeout
but a smarter implementation could ignore those (at the cost of
worse IO efficiency in these rare cases).
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> Andrew Morton wrote:
> > Nick Piggin <[email protected]> wrote:
>
> >>But I've explained that they only matter for people using it in stupid ways.
> >>fsync also poses a performance problem for programs that call it after every
> >>write(2).
> >
> >
> > There's absolutely nothing stupid about
> >
> > *p = <expr>
> > msync(p, sizeof(*p), MS_ASYNC);
> >
>
> There really is if you're expecting a short time later to do
>
> *p = <expr2>
>
> and had no need for a MS_SYNC anywhere in the meantime.
> If you did have the need for MS_SYNC, then kicking off the IO
> ASAP is going to be more efficient.
Of course these sorts of applications don't know what they'll be doing in
the future. Often the location of the next update is driven by something
which came across the network.
> >>
> >>Is a more efficient implementation known-problematic?
> >
> >
> > It's less efficient for some things. A lot.
> >
>
> But only for stupid things, right?
No.
> >
> >>What applications did
> >>you observe problems with, can you remember?
> >
> >
> > Linus has some application which was doing the above. It ran extremely
> > slowly, so we changed MS_ASYNC (ie: made it "more efficient"...)
>
> Can he remember what it is? It sounds like it is broken.
>
> OTOH, it could have been blocking on pages already under writeout
> but a smarter implementation could ignore those (at the cost of
> worse IO efficiency in these rare cases).
There's no need to do that. Look:
msync(MS_ASYNC): propagate pte dirty flags into pagecache
LINUX_FADV_ASYNC_WRITE: start writeback on all pages in region which are
dirty and which aren't presently under writeback.
LINUX_FADV_WRITE_WAIT: wait on writeback of all pages in range.
I think that covers all conceivable scenarios. One thing per operation,
leave the decisions and tuning up to the application. And it gives us two
operations which are also useful in association with regular write().
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>>and had no need for a MS_SYNC anywhere in the meantime.
>>If you did have the need for MS_SYNC, then kicking off the IO
>>ASAP is going to be more efficient.
>
>
> Of course these sorts of applications don't know what they'll be doing in
> the future. Often the location of the next update is driven by something
> which came across the network.
>
If there is no actual need for the application to start a write (eg
for data integrity) then why would it ever do that?
>
> There's no need to do that. Look:
>
> msync(MS_ASYNC): propagate pte dirty flags into pagecache
>
> LINUX_FADV_ASYNC_WRITE: start writeback on all pages in region which are
> dirty and which aren't presently under writeback.
>
> LINUX_FADV_WRITE_WAIT: wait on writeback of all pages in range.
>
> I think that covers all conceivable scenarios. One thing per operation,
> leave the decisions and tuning up to the application. And it gives us two
> operations which are also useful in association with regular write().
>
Oh yeah it is easy if you want to define some more APIs and do
it in a Linux specific way.
But the main function of msync(MS_ASYNC) AFAIK is to *start* IO.
Why do we care so much if some application goes stupid with it?
Why not introduce a linux specific MS_flag to propagate pte dirty
bits?
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> Andrew Morton wrote:
> > Nick Piggin <[email protected]> wrote:
>
> >>and had no need for a MS_SYNC anywhere in the meantime.
> >>If you did have the need for MS_SYNC, then kicking off the IO
> >>ASAP is going to be more efficient.
> >
> >
> > Of course these sorts of applications don't know what they'll be doing in
> > the future. Often the location of the next update is driven by something
> > which came across the network.
> >
>
> If there is no actual need for the application to start a write (eg
> for data integrity) then why would it ever do that?
To get the data sent to disk in a reasonable amount of time - don't leave it
floating about in memory for hours or days.
> >
> > There's no need to do that. Look:
> >
> > msync(MS_ASYNC): propagate pte dirty flags into pagecache
> >
> > LINUX_FADV_ASYNC_WRITE: start writeback on all pages in region which are
> > dirty and which aren't presently under writeback.
> >
> > LINUX_FADV_WRITE_WAIT: wait on writeback of all pages in range.
> >
> > I think that covers all conceivable scenarios. One thing per operation,
> > leave the decisions and tuning up to the application. And it gives us two
> > operations which are also useful in association with regular write().
> >
>
> Oh yeah it is easy if you want to define some more APIs and do
> it in a Linux specific way.
>
> But the main function of msync(MS_ASYNC) AFAIK is to *start* IO.
> Why do we care so much if some application goes stupid with it?
Because delaying the writeback to permit combining is a good optimisation.
The alternative of not starting new writeout of a dirty page if that page
happens to be under writeout at the time is neither one nor the other.
With that proposal, if the application really wants IO started right now,
then it's going to have to use msync(MS_SYNC).
> Why not introduce a linux specific MS_flag to propagate pte dirty
> bits?
That's what MS_ASYNC already does. We're agreed that something needs to
change and we're just discussing what that is. I'm proposing something
which is complete and flexible.
Another point here is that msync(MS_SYNC) starts writeout of _all_ dirty
pages in the file (as MS_ASYNC used to do) and it waits upon writeback of
the whole file. That's quite inefficient for an app which has lots of
threads writing to and msync()ing the same MAP_SHARED file.
We could easily enough convert msync() to only operate on the affected
region of the (non-linearly-mapped) file. But I don't think we can do that
now, because people might be relying upon the side-effects.
The fadvise() extensions allow us to fix this. And we've needed them for
some time for regular write()s anyway.
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>>If there is no actual need for the application to start a write (eg
>>for data integrity) then why would it ever do that?
>
>
> To get the data sent to disk in a reasonable amount of time - don't leave it
> floating about in memory for hours or days.
>
This is a Linux implementation detail. As such it would make sense to
introduce a new Linux specific MS_ flag for this.
>>Oh yeah it is easy if you want to define some more APIs and do
>>it in a Linux specific way.
>>
>>But the main function of msync(MS_ASYNC) AFAIK is to *start* IO.
>>Why do we care so much if some application goes stupid with it?
>
>
> Because delaying the writeback to permit combining is a good optimisation.
>
Definitely. And when the app gives us a hint that it really wants the
data on the disk, starting it as early as possible is also a good
optimisation.
>
>>Why not introduce a linux specific MS_flag to propagate pte dirty
>>bits?
>
>
> That's what MS_ASYNC already does. We're agreed that something needs to
> change and we're just discussing what that is. I'm proposing something
> which is complete and flexible.
>
I don't think there's anything wrong with your fadvise additions.
I'd rather see MS_ASYNC start IO immediately and add another MS_
flag for Linux to propagate bits.
MS_ASYNC behaviour would also somewhat match your proposed FADV_ASYNC
behaviour.
>
>
> Another point here is that msync(MS_SYNC) starts writeout of _all_ dirty
> pages in the file (as MS_ASYNC used to do) and it waits upon writeback of
> the whole file. That's quite inefficient for an app which has lots of
> threads writing to and msync()ing the same MAP_SHARED file.
>
> We could easily enough convert msync() to only operate on the affected
> region of the (non-linearly-mapped) file. But I don't think we can do that
> now, because people might be relying upon the side-effects.
>
I think if the interface was always documented correctly then we should
be able to. If the app breaks it was buggy anyway.
> The fadvise() extensions allow us to fix this. And we've needed them for
> some time for regular write()s anyway.
>
Yes they'd be nice.
Instead of
LINUX_FADV_ASYNC_WRITE
LINUX_FADV_WRITE_WAIT
can we have something more consistent? Perhaps
FADV_WRITE_ASYNC
FADV_WRITE_SYNC
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin <[email protected]> wrote:
>
> Instead of
> LINUX_FADV_ASYNC_WRITE
> LINUX_FADV_WRITE_WAIT
>
> can we have something more consistent? Perhaps
> FADV_WRITE_ASYNC
> FADV_WRITE_SYNC
Nope, I had a bit of a think about this and decided that the two operations
which we need are:
From: Andrew Morton <[email protected]>
Add two new linux-specific fadvise extensions():
LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
offsets `offset' and `offset+len'. Any pages which are currently under
writeout are skipped, whether or not they are dirty.
LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
offsets `offset' and `offset+len'.
By combining these two operations the application may do several things:
LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty
pages at the disk.
LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all
of the currently dirty pages at the disk, wait until they have been written.
It should be noted that none of these operations write out the file's
metadata. So unless the application is strictly performing overwrites of
already-instantiated disk blocks, there are no guarantees here that the data
will be available after a crash.
To complete this suite of operations I guess we should have a "sync file
metadata only" operation. This gives applications access to all the building
blocks needed for all sorts of sync operations. But sync-metadata doesn't fit
well with the fadvise() interface. Probably it should be a new syscall:
sys_fmetadatasync().
The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64().
It is made to represent that last affected byte in the file (ie: it is
inclusive). Generally, all these byterange and pagerange functions are
inclusive so we can easily represent EOF with -1.
Signed-off-by: Andrew Morton <[email protected]>
---
include/linux/fadvise.h | 6 ++++
include/linux/fs.h | 5 ++++
mm/fadvise.c | 46 +++++++++++++++++++++++++++++++++-----
mm/filemap.c | 10 ++++----
4 files changed, 57 insertions(+), 10 deletions(-)
diff -puN include/linux/fadvise.h~fadvise-async-write-commands include/linux/fadvise.h
--- devel/include/linux/fadvise.h~fadvise-async-write-commands 2006-02-09 22:29:36.000000000 -0800
+++ devel-akpm/include/linux/fadvise.h 2006-02-09 22:29:36.000000000 -0800
@@ -18,4 +18,10 @@
#define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */
#endif
+/*
+ * Linux-specific fadvise() extensions:
+ */
+#define LINUX_FADV_ASYNC_WRITE 32 /* Start writeout on range */
+#define LINUX_FADV_WRITE_WAIT 33 /* Wait upon writeout to range */
+
#endif /* FADVISE_H_INCLUDED */
diff -puN include/linux/fs.h~fadvise-async-write-commands include/linux/fs.h
--- devel/include/linux/fs.h~fadvise-async-write-commands 2006-02-09 22:29:36.000000000 -0800
+++ devel-akpm/include/linux/fs.h 2006-02-09 23:06:03.000000000 -0800
@@ -1473,6 +1473,11 @@ extern int filemap_fdatawait(struct addr
extern int filemap_write_and_wait(struct address_space *mapping);
extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
+extern int wait_on_page_writeback_range(struct address_space *mapping,
+ pgoff_t start, pgoff_t end);
+extern int __filemap_fdatawrite_range(struct address_space *mapping,
+ loff_t start, loff_t end, int sync_mode);
+
extern void sync_supers(void);
extern void sync_filesystems(int wait);
extern void emergency_sync(void);
diff -puN mm/fadvise.c~fadvise-async-write-commands mm/fadvise.c
--- devel/mm/fadvise.c~fadvise-async-write-commands 2006-02-09 22:29:36.000000000 -0800
+++ devel-akpm/mm/fadvise.c 2006-02-09 23:12:22.000000000 -0800
@@ -15,6 +15,7 @@
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/fadvise.h>
+#include <linux/writeback.h>
#include <linux/syscalls.h>
#include <asm/unistd.h>
@@ -22,13 +23,36 @@
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
+ *
+ * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
+ * offsets `offset' and `offset+len' inclusive. Any pages which are currently
+ * under writeout are skipped, whether or not they are dirty.
+ *
+ * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
+ * offsets `offset' and `offset+len'.
+ *
+ * By combining these two operations the application may do several things:
+ *
+ * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
+ * dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
+ * all of the currently dirty pages at the disk, wait until they have been
+ * written.
+ *
+ * It should be noted that none of these operations write out the file's
+ * metadata. So unless the application is strictly performing overwrites of
+ * already-instantiated disk blocks, there are no guarantees here that the data
+ * will be available after a crash.
*/
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
struct file *file = fget(fd);
struct address_space *mapping;
struct backing_dev_info *bdi;
- loff_t endbyte;
+ loff_t endbyte; /* inclusive */
pgoff_t start_index;
pgoff_t end_index;
unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd,
endbyte = offset + len;
if (!len || endbyte < len)
endbyte = -1;
+ else
+ endbyte--; /* inclusive */
bdi = mapping->backing_dev_info;
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd,
/* First and last PARTIAL page! */
start_index = offset >> PAGE_CACHE_SHIFT;
- end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
+ end_index = endbyte >> PAGE_CACHE_SHIFT;
/* Careful about overflow on the "+1" */
nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd,
filemap_flush(mapping);
/* First and last FULL page! */
- start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
- if (end_index > start_index)
- invalidate_mapping_pages(mapping, start_index, end_index-1);
+ if (end_index >= start_index)
+ invalidate_mapping_pages(mapping, start_index,
+ end_index);
+ break;
+ case LINUX_FADV_ASYNC_WRITE:
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
+ break;
+ case LINUX_FADV_WRITE_WAIT:
+ ret = wait_on_page_writeback_range(mapping,
+ offset >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
break;
default:
ret = -EINVAL;
diff -puN mm/filemap.c~fadvise-async-write-commands mm/filemap.c
--- devel/mm/filemap.c~fadvise-async-write-commands 2006-02-09 22:29:36.000000000 -0800
+++ devel-akpm/mm/filemap.c 2006-02-09 23:05:56.000000000 -0800
@@ -181,8 +181,8 @@ static int sync_page(void *word)
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
*/
-static int __filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end, int sync_mode)
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end, int sync_mode)
{
int ret;
struct writeback_control wbc = {
@@ -211,8 +211,8 @@ int filemap_fdatawrite(struct address_sp
}
EXPORT_SYMBOL(filemap_fdatawrite);
-static int filemap_fdatawrite_range(struct address_space *mapping,
- loff_t start, loff_t end)
+static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
{
return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
@@ -231,7 +231,7 @@ EXPORT_SYMBOL(filemap_flush);
* Wait for writeback to complete against pages indexed by start->end
* inclusive
*/
-static int wait_on_page_writeback_range(struct address_space *mapping,
+int wait_on_page_writeback_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
_
> Well, no. Consider a continuously-running application which modifies its
> data store via MAP_SHARED+msync(MS_ASYNC). If the msync() immediately
> started I/O, the disk would be seeking all over the place all the time. The
> queue merging and timer-based unplugging would help here, but it won't be
> as good as a big, infrequent ascending-file-offset pdflush pass.
>
> Secondly, consider the behaviour of the above application if it is modifying
> the same page relatively frequently (quite likely). If MS_ASYNC starts I/O
> immediately, that page will get written 10, 100 or 1000 times per second.
> If MS_ASYNC leaves it to pdflush, that page gets written once per 30
> seconds, so we do far much less I/O.
You're assuming a brain-dead application. Which can already thrash the
disk very nicely with O_SYNC. Yes, if you ask for control and then do
something stupid, you can send performance into the toilet.
That's not a reason to not do what the application asks unless it's
a serious DoS attack.
(For example, in my application, I'm using a raw device as a circular
buffer, so I'm already delivering perfectly sequential block numbers.
And it's a flash memory disk anyway.)
> We just don't know. It's better to leave it up to the application designer
> rather than lumping too many operations into the one syscall.
I know the operating system doesn't know. If it did, there wouldn't
be any need for the application to tell it by making a system call.
So do what the application asks for, which is what the SuS says
msync(MS_ASYNC) means, which is start the write immediately.
(I'd call it "I/O", but it's only "O".)
As I said, I'm actively looking for a way, on Linux 2.6.x, x <= 15,
to start disk writes on part of an mmapped file without either blocking
(yet) or writing other dirty pages that aren't complete yet.
[email protected] wrote:
>
> > Well, no. Consider a continuously-running application which modifies its
> > data store via MAP_SHARED+msync(MS_ASYNC). If the msync() immediately
> > started I/O, the disk would be seeking all over the place all the time. The
> > queue merging and timer-based unplugging would help here, but it won't be
> > as good as a big, infrequent ascending-file-offset pdflush pass.
> >
> > Secondly, consider the behaviour of the above application if it is modifying
> > the same page relatively frequently (quite likely). If MS_ASYNC starts I/O
> > immediately, that page will get written 10, 100 or 1000 times per second.
> > If MS_ASYNC leaves it to pdflush, that page gets written once per 30
> > seconds, so we do far much less I/O.
>
> You're assuming a brain-dead application.
We've covered this. Handing pte-dirty pages over to pdflush for prompt
writeback is a perfectly valid, sensible and fast thing to do.
It efficiently solves the single biggest problem with using MAP_SHARED
instead of write().
> As I said, I'm actively looking for a way, on Linux 2.6.x, x <= 15,
> to start disk writes on part of an mmapped file without either blocking
> (yet)
I cannot think of a way, sorry.
> or writing other dirty pages that aren't complete yet.
msync() will write all of the file's dirty pages and it has always has done
that.
>> But the main function of msync(MS_ASYNC) AFAIK is to *start* IO.
>> Why do we care so much if some application goes stupid with it?
>
> Because delaying the writeback to permit combining is a good optimisation.
In *some* cases. The application may very well know that there won't
be any following writes to combine with.
> The alternative of not starting new writeout of a dirty page if that page
> happens to be under writeout at the time is neither one nor the other.
It's a sub-optimal kludge, but it's something. As everyone is perfectly
aware, msync(MS_ASYNC) is *only* a performance optimization; you cannot
rely on it for correctness because the time to do the write is not
bounded. So if the OS screws up occasionally, not a disaster.
So Linux has a limitation that it can't start a second write on a
particular page that's already being written. (It seems like a simple
flag, tested on completion of the first writeback, would solve that
problem.)
But msync() means nothing unless people are writing to a file, and
concurrent writers have to cooperate anyway, so I don't see this as
being a big problem in practice. MS_ASYNC is a performance optimization,
so it only has to work most of the time.
Thus, this is a perfectly acceptable solution.
For example, my application only calls msync(MS_ASYNC) on a particular
page once, ever, as soon as it knows there will be no more writes to
that page. Thus, the problem would never occur. It might be nice to
extend Linux to cope gracefully with the case where I start the write
when I'm 99% sure there will be no more data (but just might be wrong),
but I don't think that's done too commonly.
>> Why not introduce a linux specific MS_flag to propagate pte dirty
>> bits?
> That's what MS_ASYNC already does.
Yes, in violation of the SuS spec. That's what msync(0) already does,
too, so the linux-specific extension already exists.
The standard description of MS_INVALIDATE is very confusing and poorly
worded, but I think it's designed for a model where mmap() copies rather
than playing page table tricks, and the OS has to copy the dirty pages
back and forth between the buffer cache "by hand". Looked at that way,
the MS_INVALIDATE wording seems to be intended as something of a "commit
memory writes back to the file system level" operation.
Which could also be expected to cause the traditional 30-second sync
timeout to start applying to the written data. In the current Linux
code, the only effect of MS_INVALIDATE over msync(0) is an extra
validity check that I'm not clear on the purpose of.
> Another point here is that msync(MS_SYNC) starts writeout of _all_ dirty
> pages in the file (as MS_ASYNC used to do) and it waits upon writeback of
> the whole file. That's quite inefficient for an app which has lots of
> threads writing to and msync()ing the same MAP_SHARED file.
Ick.
> We could easily enough convert msync() to only operate on the affected
> region of the (non-linearly-mapped) file. But I don't think we can do that
> now, because people might be relying upon the side-effects.
Um, they shouldn't be. It certainly hasn't been documented. If someone
wants that, they can use fdatasync(). Do you have any reason to believe
that there exist applications that rely on such non-portable behaviour
for correctness? I'd think someone writing such careful code would
carefully follow the guarantees.
> The fadvise() extensions allow us to fix this. And we've needed them for
> some time for regular write()s anyway.
I'm not objecting to them, just to the fact that they're non-portable
extensions needed to make the portable system calls behave in the
standard-defined way.
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>
>>Instead of
>> LINUX_FADV_ASYNC_WRITE
>> LINUX_FADV_WRITE_WAIT
>>
>> can we have something more consistent? Perhaps
>> FADV_WRITE_ASYNC
>> FADV_WRITE_SYNC
>
>
> Nope, I had a bit of a think about this and decided that the two operations
> which we need are:
>
>
Do you need to introduce a completely new concept 'wait upon writeout'
though? Not to say they can't solve the problem but I don't think they
are any more expressive and they definitely depart from the norm which
has always been sync / async AFAIK.
It may be a very useful operation in kernel, but I think userspace either
wants to definitely know the data is on disk (WRITE_SYNC), or give a hint
to start writing (WRITE_ASYNC).
From a kernel implementation point of view, WRITE_SYNC may be doing
several things (start writeout, wait writeout), but from userspace it is
just a single logical operation.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
[email protected] wrote:
>>That's what MS_ASYNC already does.
>
>
> Yes, in violation of the SuS spec. That's what msync(0) already does,
> too, so the linux-specific extension already exists.
>
> The standard description of MS_INVALIDATE is very confusing and poorly
> worded, but I think it's designed for a model where mmap() copies rather
> than playing page table tricks, and the OS has to copy the dirty pages
> back and forth between the buffer cache "by hand". Looked at that way,
> the MS_INVALIDATE wording seems to be intended as something of a "commit
> memory writes back to the file system level" operation.
>
> Which could also be expected to cause the traditional 30-second sync
> timeout to start applying to the written data. In the current Linux
Yes as we already have something that does the pte->page work (I'd agree
with your interpretation of MS_INVALIDATE), then we definitely have room
to make MS_ASYNC more efficient for applications like yours that use it
properly.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Fri, 10 Feb 2006, Nick Piggin wrote:
>
> This is a Linux implementation detail. As such it would make sense to
> introduce a new Linux specific MS_ flag for this.
> ..
> Definitely. And when the app gives us a hint that it really wants the
> data on the disk, starting it as early as possible is also a good
> optimisation.
But that's what MS_SYNC is. MS_SYNC says "I need this data written now".
MS_ASYNC moves it into the page cache. That makes 100% sense. Then it will
be written by the regular dirty page writeout. That makes 100% sense.
> I don't think there's anything wrong with your fadvise additions.
> I'd rather see MS_ASYNC start IO immediately and add another MS_
>>flag for Linux to propagate bits.
Why? I miss the _reason_ you want to do this.
The current MS_ASYNC behaviour is the sane one. It's the one that doesn't
cause the harddisk to start ticking senselessly. It's the one that allows
a person on a laptop to say "don't write dirty data every 5 seconds - do
it just every hour".
In contrast, _your_ proposal is just inflexible and inconvenient.
If somebody really really wants to "start flushing data now", then he can
do so, but that actually has absolutely zero to do with "msync()" any
more. A person who wants the flushing to start "now" might want to flush
any random dirty buffers.
Your suggestion is no different from saying "we should make every
'write()' call start the IO". Which is obviously crap.
Linus
On Fri, 10 Feb 2006, Nick Piggin wrote:
>
> It may be a very useful operation in kernel, but I think userspace either
> wants to definitely know the data is on disk (WRITE_SYNC), or give a hint
> to start writing (WRITE_ASYNC).
Only from a _stupid_ user standpoint.
The fact is, "start writing and wait for the result" is fundamentally a
totally broken operation.
Why?
Because a smart user actually would want to do
- start writing this
- start writing that
- start writing that-other-thing
- wait for them all.
The reason synchronous write performance is absolutely disgusting is
exactly that people think "start writing" should be paired up with "wait
for it".
So the kernel internally separates "start writing" and "wait for it" for
very good reasons. Reasons that in no way go away just because you move to
user space.
And yes, there very much is a third operation too: "mark dirty". That's
the _common_ one. That's the fundamental one. That's the one that we use
every single day, without even realizing. The "start writing" and "wait
for it" operations are actually the rare ones.
Linus
Linus Torvalds wrote:
>
> On Fri, 10 Feb 2006, Nick Piggin wrote:
>
>>This is a Linux implementation detail. As such it would make sense to
>>introduce a new Linux specific MS_ flag for this.
>>..
>>Definitely. And when the app gives us a hint that it really wants the
>>data on the disk, starting it as early as possible is also a good
>>optimisation.
>
>
> But that's what MS_SYNC is. MS_SYNC says "I need this data written now".
>
Yes but it is synchronous.
> MS_ASYNC moves it into the page cache. That makes 100% sense. Then it will
> be written by the regular dirty page writeout. That makes 100% sense.
>
MS_INVALIDATE does that (in Linux), the spec is poorly worded but the
intention seems to be that it would push dirty state back into pagecache for
implementations such as ours.
>
>>I don't think there's anything wrong with your fadvise additions.
>>I'd rather see MS_ASYNC start IO immediately and add another MS_
>>flag for Linux to propagate bits.
>
>
> Why? I miss the _reason_ you want to do this.
>
[email protected] has an application (database or logging I think), which
uses MS_SYNC to provide integrity guarantees, however it is possible to do
useful work between the last write to memory and the commit point. MS_ASYNC
is used to start the IO and pipeline work.
> The current MS_ASYNC behaviour is the sane one. It's the one that doesn't
> cause the harddisk to start ticking senselessly. It's the one that allows
> a person on a laptop to say "don't write dirty data every 5 seconds - do
> it just every hour".
>
MS_INVALIDATE
> In contrast, _your_ proposal is just inflexible and inconvenient.
>
Currently MS_ASYNC does the same as MS_INVALIDATE. But it used to start
IO (before 2.5.something), and apparently it does in Solaris as well.
> If somebody really really wants to "start flushing data now", then he can
> do so, but that actually has absolutely zero to do with "msync()" any
> more. A person who wants the flushing to start "now" might want to flush
> any random dirty buffers.
>
I didn't quite understand what you're saying here.
> Your suggestion is no different from saying "we should make every
> 'write()' call start the IO". Which is obviously crap.
>
I think it is quite a bit different. Obviously what you're saying is crap,
but I think there are good arguments for changing MS_ASYNC so it is not
quite so obvious.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Linus Torvalds wrote:
>
> On Fri, 10 Feb 2006, Nick Piggin wrote:
>
>>It may be a very useful operation in kernel, but I think userspace either
>>wants to definitely know the data is on disk (WRITE_SYNC), or give a hint
>>to start writing (WRITE_ASYNC).
>
>
> Only from a _stupid_ user standpoint.
>
> The fact is, "start writing and wait for the result" is fundamentally a
> totally broken operation.
>
No. Userspace has (almost) a transparent pagecache to backing store,
the only time they care about it is data integrity points in which
case they want to know that it is flushed; or performance hints which
might tell the kernel to write them sooner, or later (or other hints).
Wait until writeout has finished is like an implementation detail that
I can't see how it would be ever useful on its own.
> Why?
>
> Because a smart user actually would want to do
>
> - start writing this
> - start writing that
> - start writing that-other-thing
> - wait for them all.
>
No, you are thinking about what the kernel does. Subtle difference. A
smart user wants to:
- start writing this
- start writing that
- start writing that-other-thing
- make sure this that and the other have reached backing store
OK so in effect it is the same thing, but it is better to export the
interface that reflects how the user interacts with pagecache.
WRITE_SYNC obviously does the "wait for them all" (aka ensure they
hit backing store) thing too, right? It performs exactly the same
role that WRITE_WAIT would do in the above example.
> The reason synchronous write performance is absolutely disgusting is
> exactly that people think "start writing" should be paired up with "wait
> for it".
>
> So the kernel internally separates "start writing" and "wait for it" for
> very good reasons. Reasons that in no way go away just because you use to
> user space.
>
They don't go away but they take different forms. "start writing" is
a performance hint. "wait for it" is only ever a part of "send to
backing store" operation.
My proposal isn't really different to Andrew's in terms of functionality
(unless I've missed something), but it is more consistent because it
does not introduce this completely new concept to our userspace API but
rather uses the SYNC/ASYNC distinction like everything else.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Sat, 11 Feb 2006, Nick Piggin wrote:
>
> MS_INVALIDATE does that (in Linux),
I don't actually think it does.
In _current_ linux it does. In some other versions, it will have thrown
the dirty data away. Also, it will make subsequent accesses much much more
expensive - and it doesn't work on locked areas.
> the spec is poorly worded but the
> intention seems to be that it would push dirty state back into pagecache for
> implementations such as ours.
As an application writer, you'd be absolutely crazy to depend on that.
Using "msync( .. 0)" _may_ actually work reliably under any Linux version,
but I wouldn't bet on it, and it's quite possible that it does strange
things on other systems. Again, an application writer that uses it would
have to be deranged (or very much a kernel person - I could imagine doing
it myself, but I could _not_ imagine doing it as a non-kernel developer).
> [email protected] has an application (database or logging I think), which
> uses MS_SYNC to provide integrity guarantees, however it is possible to do
> useful work between the last write to memory and the commit point. MS_ASYNC
> is used to start the IO and pipeline work.
So you're saying that there is one application that knows it could use
different semantics?
Now, please enumerate all the applications that use MS_ASYNC and prefer
the current semantics.
When you know that, you have an argument.
In the meantime, you have an example of an application that wants _new_
semantics.
> > The current MS_ASYNC behaviour is the sane one. It's the one that doesn't
> > cause the harddisk to start ticking senselessly. It's the one that allows a
> > person on a laptop to say "don't write dirty data every 5 seconds - do it
> > just every hour".
>
> MS_INVALIDATE
Repeating something doesn't make it so.
> > In contrast, _your_ proposal is just inflexible and inconvenient.
>
> Currently MS_ASYNC does the same as MS_INVALIDATE. But it used to start
> IO (before 2.5.something), and apparently it does in Solaris as well.
Actually, it did _not_ use to start IO.
Then, somebody made it do so, and people eventually screamed, and it was
reverted again.
Go check Linux-2.0 or something. You'll also see the "MS_INVALIDATE means
throw the dirty bit away" behaviour.
The _sane_ semantics are that if you say "MS_INVALIDATE" the dirty bit is
just thrown away. If you say "MS_INVALIDATE | MS_ASYNC", the dirty bit is
saved in the page cache and then the page is unmapped. And MS_SYNC
obviously does the same thing, except it also waits for it.
Those are the _logically consistent_ semantics. And it's what Linux
historically did. The fact that we now think "MS_INVALIDATE" on its own
should mean "save the dirty state" is because some other broken operating
system does it, and it's sadly the _safer_ thing to do, even if it's
clearly logically not sane. If you invalidate a mapping, you throw it
away, you don't save it.
Gaah.
I took the time to actually unpack 2.0.40. And yes, it does exactly what I
remember it doing. If you pass in MS_INVALIDATE (with no *SYNC flags) it
does:
pte_clear(ptep);
...
if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
free_page(page);
return 0;
}
without ever marking anything dirty.
> > If somebody really really wants to "start flushing data now", then he can do
> > so, but that actually has absolutely zero to do with "msync()" any more. A
> > person who wants the flushing to start "now" might want to flush any random
> > dirty buffers.
>
> I didn't quite understand what you're saying here.
I'm saying that "start flushing now" has _zero_ to do with an mmap.
It's a perfectly valid operation after a _write_ call too - even if you
never mmaped the area at all.
So if somebody wants to start background IO, what has that got to do with
msync()?
Linus
On Sat, 11 Feb 2006, Nick Piggin wrote:
>
> No, you are thinking about what the kernel does. Subtle difference. A
> smart user wants to:
>
> - start writing this
> - start writing that
> - start writing that-other-thing
> - make sure this that and the other have reached backing store
>
> OK so in effect it is the same thing, but it is better to export the
> interface that reflects how the user interacts with pagecache.
>
> WRITE_SYNC obviously does the "wait for them all" (aka ensure they
> hit backing store) thing too, right? It performs exactly the same
> role that WRITE_WAIT would do in the above example.
NOOOOOO!
Think about it for a second. Think about the usage case you yourself were
quoting.
The "magic" in IO is "overlapping IO". If you don't get overlapping IO,
your interfaces are broken. End of story.
And WRITE_SYNC _cannot_ do overlapping IO.
It's entirely possible that somebody else (or that very same program) has
dirtied the same pages that you started write-out on earlier. And that is
when "wait for writes to finish" and "WRITE_SYNC" _differ_.
If you want synchronous writes, use synchronous writes. But if you want
asynchronous writes, you do _not_ implement them as "start writes now" and
"write synchronously". You implement them as "start writes now" and "wait
for the writes to have finished".
There's another very specific and important difference: "wait for the
writes" is fundamentally an interruptible and pollable operation, which
means that it's a lot easier to integrate into any system that has to do
other things too. In contrast, WRITE_SYNC is _neither_ easily
interruptible nor pollable.
So WRITE_SYNC has clearly different behaviour. There's a good reason the
kernel internally has "start write" + "wait for write", and I'll repeat:
none of those reasons go away just because you move to user space.
> My proposal isn't really different to Andrew's in terms of functionality
> (unless I've missed something), but it is more consistent because it
> does not introduce this completely new concept to our userspace API but
> rather uses the SYNC/ASYNC distinction like everything else.
Your proposal has two _huge_ downsides:
- it changes semantics, and you have absolutely _no_ idea of who depends
on the performance semantics of the old behaviour. In contrast, I can
tell you that we did it once before, and we reverted it.
- it's not at all consistent. The _current_ behaviour is consistent, and
matches 100% the current behaviour of sync vs async write().
I really don't see the point.
Linus
Arrgh. You're being thick. So I'm going to try to be very clear.
> But that's what MS_SYNC is. MS_SYNC says "I need this data written now".
>
> MS_ASYNC moves it into the page cache. That makes 100% sense. Then it will
> be written by the regular dirty page writeout. That makes 100% sense.
No. MS_ASYNC says "I need the data written now.". MS_SYNC says
"I need the data to have been written." Notice the difference:
one is in the future tense and one is in the past tense.
One is "get to work" and the other is "are you done yet?"
>> I don't think there's anything wrong with your fadvise additions.
>> I'd rather see MS_ASYNC start IO immediately and add another MS_
>> flag for Linux to propogate bits.
> Why? I miss the _reason_ you want to do this.
I believe we all agree that MS_ASYNC is at most a performance hint.
It doesn't give any firm guarantee about when the write will happen, just
"soon". Deleting every msync(MS_ASYNC) from a program cannot make it
buggier, just possibly slower.
Further, among the various discussions, we have identified two possible
cases where someone might want to give a hint to the operating system
that it would be a Good Idea to copy some dirty data to backing storage:
1) The application is done writing the data, and it's just a hint to
the VM system that there's no sense procrastinating. This is purely
as a kindness to the VM system; the application never intends to check
up on the write with MS_SYNC. The data is still wanted for read,
so MADV_DONTNEED would be inappropriate.
2) The application is going to invoke MS_SYNC some time in the
future and it would appreciate it if the job were already started.
(If you can think of a third, please mention it.)
Moving the data to the page cache addresses #1.
I want to address #2.
As you yourself have pointed out, there are zero promises unless you
follow up with MS_SYNC or equivalent. If you don't, all you're doing
is offering a clue to the VM system. That's a very optional hint;
It'll get around to cleaning the page itself if it needs the space.
But now suppose we *do* follow up with MS_SYNC. In this case,
the hint given by MS_ASYNC is a little more pointed: I am going
to need this write completed soon, so don't delay.
The only thing we can argue about here is the granularity. Is buffering
the hinted data for 5 to 30 seconds to do a bulk write appropriate?
Will it be at least that long before the MS_SYNC request arrives? Or,
as in my application, are times well under 1 second more common?
My opinion is that people don't like waiting 5 seconds for computers
to do their stuff. Not a lot of applications take that long to
generate all the data they're going to.
Now, I'm not saying that both of these can't be useful, and for the
first, just marking the page cache dirty isn't good enough.
But if you read the standard definition of MS_ASYNC, it seems absolutely
crystal "anybody who can't see this is an illiterate moron" clear
that MS_ASYNC is described as useful for use case #2.
If you want to add support for case #1 with a longer timeout and big
batches, then you'll have to add another option. I might point out
that msync() with flags = 0 has done that on Linux for a while.
But if you're providing separate support for both use cases, then
please RTFS and notice which one is closer to the documented
behaviour of MS_ASYNC and thereby deserves the standard flag name.
Here's a quote to help you, from IEEE STS 1003.1-2001:
# When MS_ASYNC is specified, msync() shall return immediately once all
# the write operations are initiated or queued for servicing
You can language lawyer if you like, but when I tell you to "buy a
ticket or join the queue", I expect you to be waiting in the queue to
buy tickets, not some other, slower queue. And I expect you to know
that unless you're deliberately being difficult.
> The current MS_ASYNC behaviour is the sane one. It's the one that doesn't
> cause the harddisk to start ticking senselessly. It's the one that allows
> a person on a laptop to say "don't write dirty data every 5 seconds - do
> it just every hour".
It's not sane, it's just useless. What application is going to wait
even 5 seconds to follow up with MS_SYNC? Software timeouts are
a bit shorter than the snooze button on your alarm clock.
> In contrast, _your_ proposal is just inflexible and inconvenient.
I can't comment on flexibility, but it's very convenient for a
clearly defined set of applications (which I happen to be maintaining
one of), and has the advantage of being specified in the relevant
Unix standards.
> If somebody really really wants to "start flushing data now", then he can
> do so, but that actually has absolutely zero to do with "msync()" any
> more. A person who wants the flushing to start "now" might want to flush
> any random dirty buffers.
No, they want to flush just the data that they're going to wait on the
completion of.
As I said, it's a poor man's asynchronous I/O. Full async I/O is
probably more flexible, but isn't widely deployed yet.
> Your suggestion is no different from saying "we should make every
> 'write()' call start the IO". Which is obviously crap.
NO, dammit! That would be the equivalent of saying that every memory write
to an mmapped page should start the I/O. Which is, indeed, obviously
crap. If you don't have any particular schedule for performing the
write-back, then don't do anything at all! The VM system will clean the
page when it needs the RAM for something else.
The only reason for calling msync(MS_ASYNC) is because I have a deadline
in mind, and I think that for an OS to assume that it doesn't need to take
action on that advance warning for 5 seconds or so is grossly overestimating
the time scales at which computers work these days.
Used as a basic async I/O primitive, MS_ASYNC lets you start multiple
writes, and then you can wait for completion with MS_SYNC without
forcing an execution order on the OS. If the data hasn't been dirtied
in between, MS_SYNC is just waiting for the in-progress I/O to
complete. (You can use MADV_WILLNEED similarly for reads.)
MS_ASYNC is all about performance. That's its only possible use.
Sticking a 5-second delay into a performance hint is the "obviously
crap" in this discussion.
Sheesh!
On Fri, 10 Feb 2006, Linus Torvalds wrote:
>
> So WRITE_SYNC has clearly different behaviour. There's a good reason the
> kernel internally has "start write" + "wait for write", and I'll repeat:
> none of those reasons go away just because you move to user space.
Btw, just to clarify: there _are_ things that do change when you go from
user space to kernel space. It's true that you lose some visibility, and
it's also true that the kernel has more than just "start write" semantics.
So the kernel actually has "start write, but don't wait for stuff that
has IO already pending", and "start write, and if writeback was active on
a re-dirtied page, wait for and re-start it".
I don't know if user space wants quite -that- much choice. The "start
write but ignore busy areas" doesn't actually make sense together with
"wait for it", since you don't know what (if any) you're really waiting
for.
So it's really three operations
- try to start flushing, so that you'll have less work pending later
- start flushing
- wait for any pending flush
[ From a pure "correctness" angle, we could say that "start flushing"
is the same as "wait for pending" + "try to start". However, the "IO
should overlap as much as possible" argument says that that is the
wrong thing to do, since we can start flushing non-pending IO before we
wait for the old pending one ]
Now, most user programs probably don't care one whit.
But I think Andrew's patch makes sense. It exposes the internal kernel
working in a logical fashion for people who do care. Yes, it's
Linux-specific, but hey, so is arguing about the exact semantics of
MS_INVALIDATE (which is version-specific).
Linus
Linus Torvalds wrote:
>
> On Sat, 11 Feb 2006, Nick Piggin wrote:
>
>>MS_INVALIDATE does that (in Linux),
>
>
> I don't actually think it does.
>
> In _current_ linux it does. In some other versions, it will have thrown
> the dirty data away. Also, it will make subsequent accesses much much more
> expensive - and it doesn't work on locked areas.
>
>
>> the spec is poorly worded but the
>>intention seems to be that it would push dirty state back into pagecache for
>>implementations such as ours.
>
>
> As an application writer, you'd be absolutely crazy to depend on that.
>
Either the older versions of Linux are totally broken WRT the spec, or
the spec totally broke compatibility. Either way I guess you would be
crazy to depend on that :(
>>[email protected] has an application (database or logging I think), which
>>uses MS_SYNC to provide integrity guarantees, however it is possible to do
>>useful work between the last write to memory and the commit point. MS_ASYNC
>>is used to start the IO and pipeline work.
>
>
> So you're saying that there is one application that knows it could use
> different semantics?
>
> Now, please enumerate all the applications that use MS_ASYNC and prefer
> the current semantics.
>
> When you know that, you have an argument.
>
I must have missed the post where you enumerated all said applications
when changing from 2.4 and 2.5.67 behaviour to current.
> In the meantime, you have an example of an application that wants _new_
> semantics.
>
2.4 semantics, actually. I have an example of a _regression_.
>
>>>The current MS_ASYNC behaviour is the sane one. It's the one that doesn't
>>>cause the harddisk to start ticking senselessly. It's the one that allows a
>>>person on a laptop to say "don't write dirty data every 5 seconds - do it
>>>just every hour".
>>
>>MS_INVALIDATE
>
>
> Repeating something doesn't make it so.
>
But it is so. Why did you change 2.0 semantics so much? Obviously because
it was broken WRT the spec - I can tell you right now there could have been
a whole lot of applications that preferred the semantics of just throwing
out the data because it is faster, so it wasn't that.
If you want to prove me wrong by quoting buggy behaviour from a 7 year old
kernel.... how am I supposed to argue with that?
>
>>>In contrast, _your_ proposal is just inflexible and inconvenient.
>>
>>Currently MS_ASYNC does the same as MS_INVALIDATE. But it used to start
>>IO (before 2.5.something), and apparently it does in Solaris as well.
>
>
> Actually, it did _not_ use to start IO.
>
> Then, somebody made it do so, and people eventually screamed, and it was
> reverted again.
>
> Go check Linux-2.0 or something. You'll also see the "MS_INVALIDATE means
> throw the dirty bit away" behaviour.
>
Sounds like someone else must have screamed in 2.0 because it was buggy
and the behaviour was changed to match standards for 2.4 and AFAIKS 2.2 does
the same (although I'm not so good at reading 2.2 source).
So those people who didn't like it must have been screaming for a long long
time until it was finally changed in 2.5.68. Unfortunately we have someone
else screaming now (and two years ago) because of the most recent change.
> The _sane_ semantics are that if you say "MS_INVALIDATE" the dirty bit is
> just thrown away. If you say "MS_INVALIDATE | MS_ASYNC", the dirty bit is
> saved in the page cache and then the page is unmapped. And MS_SYNC
> obviously does the same thing, except it also waits for it.
>
They may sound sane to you but if you go throwing away the dirty bit
against the standards then it is very broken.
>>>If somebody really really wants to "start flushing data now", then he can do
>>>so, but that actually has absolutely zero to do with "msync()" any more. A
>>>person who wants the flushing to start "now" might want to flush any random
>>>dirty buffers.
>>
>>I didn't quite understand what you're saying here.
>
>
> I'm saying that "start flushing now" has _zero_ to do with an mmap.
>
> It's a perfectly valid operation after a _write_ call too - even if you
> never mmaped the area at all.
>
> So if somebody wants to start background IO, what has that got to do with
> msync()?
>
It seems very obvious to me that it is a hint. If you were expecting
to call msync(MS_SYNC) at some point, then you could hope that hinting
with msync(MS_ASYNC) at some point earlier might improve its efficiency.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Fri, 10 Feb 2006, [email protected] wrote:
>
> No. MS_ASYNC says "I need the data written now.".
Says you.
I say (and I have a decade of Linux historical behaviour to back it up)
that is says "I'm done, start flushing this out asynchronously like all
the other data I have written".
And yes, there are performance implications. But your claim that "start IO
now" performs better is bogus. It _sometimes_ performs better, but
sometimes performs much worse.
Take an example. You have a 200MB dirty area in a 1GB machine. You do
MS_ASYNC. What do you want to happen?
Do you want IO to be started on all of it? That's going to take quite a
while, and be really nasty for the system. Or do you want it to be
gracefully buffered out, the way we do all normal background writes?
"Performance" is very much not just about how fast it hits the platter.
Linus
Linus Torvalds wrote:
>
> On Sat, 11 Feb 2006, Nick Piggin wrote:
>
>>No, you are thinking about what the kernel does. Subtle difference. A
>>smart user wants to:
>>
>>- start writing this
>>- start writing that
>>- start writing that-other-thing
>>- make sure this that and the other have reached backing store
>>
>>OK so in effect it is the same thing, but it is better to export the
>>interface that reflects how the user interacts with pagecache.
>>
>>WRITE_SYNC obviously does the "wait for them all" (aka ensure they
>>hit backing store) thing too, right? It performs exactly the same
>>role that WRITE_WAIT would do in the above example.
>
>
> NOOOOOO!
>
> Think about it for a second. Think about the usage case you yourself were
> quoting.
>
> The "magic" in IO is "overlapping IO". If you don't get overlapping IO,
> your interfaces are broken. End of story.
>
> And WRITE_SYNC _cannot_ do overlapping IO.
>
What do you mean by overlapping?
fadvise(fd, 100, 200, FADV_WRITE_ASYNC);
fadvise(fd, 300, 400, FADV_WRITE_ASYNC);
fadvise(fd, 100, 200, FADV_WRITE_SYNC);
fadvise(fd, 300, 400, FADV_WRITE_SYNC);
Will do exactly the same as Andrew's
fadvise(fd, 100, 200, FADV_ASYNC_WRITE);
fadvise(fd, 300, 400, FADV_ASYNC_WRITE);
fadvise(fd, 100, 200, FADV_WRITE_WAIT);
fadvise(fd, 300, 400, FADV_WRITE_WAIT);
> It's entirely possible that somebody else (or that very same program) has
> dirtied the same pages that you started write-out on earlier. And that is
> when "wait for writes to finish" and "WRITE_SYNC" _differ_.
>
Yeah they do differ but if you are using sync writes then you obviously
have some data integrity requirements and you _know_ who is writing to
your file and when. That's my point. You're thinking kernel mode. The
userspace requirement for sync writes is "this has reached backing store".
> If you want synchronous writes, use synchronous writes. But if you want
> asynchronous writes, you do _not_ implement them as "start writes now" and
> "write synchronously". You implement them as "start writes now" and "wait
> for the writes to have finished".
>
_You_ do, yes. You are a kernel hacker. You implement synchronous writes.
Implementing synchronous writes is what you do.
Userspace does not care. They use synchronous writes to guarantee it has
hit backing store. They've managed quite nicely up until now without having
your implementation details exposed to them (when is a page dirty? when is
it "under writeout"? who cares? I just want to know if it is on backing
store or not).
> There's another very specific and important difference: "wait for the
> writes" is fundamentally an interruptible and pollable operation, which
> means that it's a lot easier to integrate into any system that has to do
> other things too. In contrast, WRITE_SYNC is _neither_ easily
> interruptible nor pollable.
>
It is just as easy as WRITE_WAIT to do both. In the pollable case you
just need another flag to say you don't want to block, same as would
be required for WRITE_WAIT.
Seems like you're clutching for straws here.
> So WRITE_SYNC has clearly different behaviour. There's a good reason the
> kernel internally has "start write" + "wait for write", and I'll repeat:
> none of those reasons go away just because you move to user space.
>
>
>>My proposal isn't really different to Andrew's in terms of functionality
>>(unless I've missed something), but it is more consistent because it
>>does not introduce this completely new concept to our userspace API but
>>rather uses the SYNC/ASYNC distinction like everything else.
>
>
> Your proposal has two _huge_ downsides:
>
I was still talking about new additions to fadvise here, not the msync stuff.
> - it changes semantics, and you have absolutely _no_ idea of who depends
> on the performance semantics of the old behaviour. In contrast, I can
> tell you that we did it once before, and we reverted it.
>
> - it's not at all consistent. The _current_ behaviour is consistent, and
> matches 100% the current behaviour of sync vs async write().
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Sat, 11 Feb 2006, Nick Piggin wrote:
>
> It seems very obvious to me that it is a hint. If you were expecting
> to call msync(MS_SYNC) at some point, then you could hope that hinting
> with msync(MS_ASYNC) at some point earlier might improve its efficiency.
And it will. MS_ASYNC tells the system about dirty pages. It _should_
actually initiate writeback if the system decides that it has lots of
dirty pages. Of course, if the system doesn't have a lot of dirty pages,
the kernel will decide that no writeback is necessary.
If you (as an application) know that you will wait for the IO later (which
is _not_ what MS_ASYNC talks about), why don't you just start it?
ie what's wrong with Andrew's patch which is what I also encourage?
I contend that "mmap + MS_ASYNC" should work as "write()". That's just
_sensible_.
Btw, you can equally well make the argument that "write()" is a hint that
we should start IO, so that if we do fdatasync() later, it will finish
more quickly. It's _true_. It just isn't the whole truth. It makes things
_slower_ if you don't do fdatasync(), the same way you can do MS_ASYNC
without doing MS_SYNC afterwards.
Now, if your argument is more general, aka "we should do better at
writeback in general", I actually wouldn't disagree. We probably _should_
do better at write-back. The "sync every five seconds" causes pulses of
(efficient) IO, but it also allows for lots of dirty stuff to have
collected for no good reason, and causes bad IO latency for reads when it
happens.
So if you were to argue _in_general_ for smoother write-back, I wouldn't
actually object at all. I think it would potentially make much sense to
make both "write()" _and_ things like msync(MS_ASYNC) perhaps see if the
IO queue has been idle for a second, and if so, start trickling writes
out.
I bet that would be lovely. I hate how un-tarring a big tree tends to have
these big hickups, and "vmstat 1" shows that the disk isn't even writing
all the time until half-way through the "untar".
IOW, I think you could re-phrase your argument in a more generic way, and
I might well _agree_ with it. I just don't think it has anything to do
with MS_ASYNC _in_particular_.
Linus
Linus Torvalds wrote:
>
> On Sat, 11 Feb 2006, Nick Piggin wrote:
>
>>It seems very obvious to me that it is a hint. If you were expecting
>>to call msync(MS_SYNC) at some point, then you could hope that hinting
>>with msync(MS_ASYNC) at some point earlier might improve its efficiency.
>
>
> And it will. MS_ASYNC tells the system about dirty pages. It _should_
> actually initiate writeback if the system decides that it has lots of
> dirty pages. Of course, if the system doesn't have a lot of dirty pages,
> the kernel will decide that no writeback is necessary.
>
> If you (as an application) know that you will wait for the IO later (which
> is _not_ what MS_ASYNC talks about), why don't you just start it?
>
It depends how you interpret the standards and what you think sensible
behaviour would be, I guess (obviously our current MS_ASYNC is not
technically buggy, we're arguing about whether or not it is suboptimal).
But given that there is an MS_INVALIDATE (I interpret mmap + MS_INVALIDATE
should work as write()), and that one would _expect_ MS_ASYNC to closely
match MS_SYNC, I think MS_ASYNC should start writeout straight away.
The fact that we've historically had a buggy MS_INVALIDATE implementation
is a non argument when it comes to the interpretation of the standards.
> ie what's wrong with Andrew's patch which is what I also encourage?
>
> I contend that "mmap + MS_ASYNC" should work as "write()". That's just
> _sensible_.
>
> Btw, you can equally well make the argument that "write()" is a hint that
> we should start IO, so that if we do fdatasync() later, it will finish
> more quickly. It's _true_. It just isn't the whole truth. It makes things
> _slower_ if you don't do fdatasync(), the same way you can do MS_ASYNC
> without doing MS_SYNC afterwards.
>
I wouldn't argue that because I don't agree with your contention. I
argue that MS_ASYNC should do as much of the work of MS_SYNC as possible,
without blocking.
From the standard (msync):
Description
The msync() function shall write all modified data to permanent storage
locations...
When MS_ASYNC is specified, msync() shall return immediately once all
the write operations are initiated or queued for servicing;
It is talking about write operations, not dirtying. Actually the only
difference with MS_SYNC is that it waits for said write operations (of the
type queued up by MS_ASYNC) to complete.
So our current MS_ASYNC behaviour might technically not violate a standard
(depending on what you consider initiating / queueing writes), but it would
be akin to having MS_SYNC waiting for pages to become clean without actually
starting the writeout either (which is likewise inefficient but technically
correct).
[snip smooth writeback]
That would be a nice thing yes, but again I don't agree that MS_ASYNC
is semantically equivalent to write()
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Sat, 11 Feb 2006, Nick Piggin wrote:
>
> What do you mean by overlapping?
I'm just talking about the "same area gets re-dirtied while it's already
busy being written". Depending on the _program_, this:
- never happens in practice
- is very common
- should just leave the page dirty
- should always start a new IO (waiting for the old one first, or use a
barrier if you want to be fancy).
Let's do a more hands-on example, just to make it less abstract.
- let's say that you have some kind of file-backed storage, and you
basically want to let the kernel know about your modifications, so that
it can DTRT (and let's ignore what the "right thing" is for a moment)
- The "dirty" bit very fundamentally is obviously at a page granularity,
but your data may well be at a much finer granularity. In particular,
your data may be a log that keeps growing.
- So let's say that you append to the log, and choose (for some reason,
never mind) to let the kernel know. So you effectively do something
like
memcpy(logptr, newentry, newentrysize);
logptr = logptr + newentrysize;
if (time_to_msync) {
msync(msyncptr, logptr - msyncptr, MS_ASYNC);
msyncptr = logptr;
}
Ok?
Now, the question is, what do we want to happen at the MS_ASYNC.
In particular, what happens if the _previous_ MS_ASYNC had started the IO
(either directly, like in your world, or by bdflush just picking it up,
it really doesn't matter) on the _previous_ old end of the log area, so
the partial page at the old "msyncptr" point may actually be under IO
still.
We have multiple choices:
- we ignore the issue (which is what the current behaviour for MS_ASYNC
is, since it just marks things dirty in the page cache)
- we mark the page dirty, but we don't start IO on it, since it's busy
(and since it's dirty, it will _eventually_ get written out)
- we actually wait for the old IO, in order to start IO on it again.
Now, I don't think that the third option is sane for MS_ASYNC (ie I don't
think even you want -that- behaviour), but in general, all these three
choices are actually sane. Notice how none of them actually involve
waiting for the new _result_. It's only a question about whether to wait
for an old write when we start a new one, or leave the new one entirely
_unstarted_.
> fadvise(fd, 100, 200, FADV_ASYNC_WRITE);
> fadvise(fd, 300, 400, FADV_ASYNC_WRITE);
> fadvise(fd, 100, 200, FADV_WRITE_WAIT);
> fadvise(fd, 300, 400, FADV_WRITE_WAIT);
I'm saying that a valid pattern is
.. dirty offset 100-200 ..
fadvice(fd, 100, 200, FADV_WRITE_START_TRY);
.. dirty offset 200-300 ..
fadvice(fd, 200, 300, FADV_WRITE_START_TRY);
.. dirty offset 300-400 ..
fadvice(fd, 300, 400, FADV_WRITE_START_TRY);
is a valid thing to do ("try to start IO, but don't guarantee it") as a
way to get things going. But that would never pair up with a "wait for
IO", because there's no guarantee that the IO got started (for example, we
may have started the IO when only bytes 100-200 were dirty, then we
dirtied the other bytes, but we didn't re-start the IO for them because
the previous IO to the same page was still pending, so the bytes never hit
storage and they aren't even outstanding).
But the "FADV_WRITE_START_TRY" is actually the best thing if what you are
trying to do is to keep changes _minimal_ so that when you later actually
finish the whole thing, you can do
fadvice(fd, 100, 400, FADV_WRITE_WAIT);
which is your "write and wait".
So far so good, and we don't actually care. The unconditional "write and
wait" at the end means that it's irrelevant whether the "START_TRY" thing
actually started the IO or not - the START_TRY thing _can_ be a no-op if
you want to.
These sound like the semantics you want. No?
And yes, I'm perfectly happy with them. I think this is what people would
do. I just wanted to make sure that we're AWARE of the fact that it
implies that the ASYNC thing wouldn't necessarily always even start IO.
And the reason I wanted to make sure of that is that the whole thread
started from you complaining about MS_ASYNC not starting the IO. I'm
saying that if you _require_ starting of IO, then the FADV_WRITE_WAIT
actually sensibly has different semantics, which can be a lot cheaper to
do in the presence of other writers (ie then the write-wait would only
need to wait for any outstanding IO, not start writing out stuff that
somebody else had written).
And the reason I wanted to take up the semantic difference is because
there _are_ semantic differences.
If you only "commit" things when you have nothing dangling, you'll see the
above patterns. But it's a valid thing to commit things after you've made
"further" log changes (that you're _not_ ready to commit). For example,
say that your log is really dirtying all the time, but you synchronize it
at certain points and write the pointer to the synchronized state
somewhere else. What would you do?
Your pattern would actually be
.. dirty offset 100-200 ..
fadvice(fd, 100, 200, FADV_WRITE_START);
.. dirty offset 200-300 ..
fadvice(fd, 200, 300, FADV_WRITE_START);
.. dirty offset 300-400 ..
fadvice(fd, 300, 400, FADV_WRITE_START);
.. dirty offset 400-415 .. (for the next transaction)
fadvice(fd, 100, 400, FADV_JUST_WAIT); (for the previous one)
and here is where the semantics differ. The "always start IO, and just
wait for IO" won't be waiting for the partial stuff (that doesn't matter).
While the "write and wait" would synchronously write stuff that we just
don't care about (just because they happen to be on the same "IO
granularity" block).
This "unconditional write start" + "unconditional wait only" pattern in
theory allows you to optimize all the IO patterns by hand, and have less
synchronous waits, because it wouldn't wait for state that is dirty, but
that doesn't matter.
But as long as people are _aware_ of this issue, I don't much care.
Linus
Linus Torvalds wrote:
>
> On Fri, 10 Feb 2006, [email protected] wrote:
>
>>No. MS_ASYNC says "I need the data written now.".
>
>
> Says you.
>
> I say (and I have a decade of Linux historical behaviour to back it up)
> that is says "I'm done, start flushing this out asynchronously like all
> the other data I have written".
>
> And yes, there are performance implications. But your claim that "start IO
> now" performs better is bogus. It _sometimes_ performs better, but
> sometimes performs much worse.
>
> Take an example. You have a 200MB dirty area in a 1GB machine. You do
> MS_ASYNC. What do you want to happen?
>
It quite obviously depends on the context in which one is using it,
which will depend on what one expects it to do (unless one is an idiot).
If [email protected]'s[1] database has dirtied 200MB of data and
knows it will not dirty it again and has several hundred ms of useful
work to do before it must call MS_SYNC, then...
> Do you want IO to be started on all of it?
... yes.
[1] Come on, linux, can you at least make up a name for me, or are
you really called Linux? (in which case you'd better make up a
new name anyway when arguing with Linus about Linux, for the
sake of everyone's sanity)
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Sat, 11 Feb 2006, Nick Piggin wrote:
>
> When MS_ASYNC is specified, msync() shall return immediately once all
> the write operations are initiated or queued for servicing;
>
> It is talking about write operations, not dirtying. Actually the only
> difference with MS_SYNC is that it waits for said write operations (of the
> type queued up by MS_ASYNC) to complete.
Right. And it's what we do. We queue them by moving the pages to the dirty
lists (yeah, it's just a tag on the page index thing, whatever).
And yes, you argue that we should move the queue closer to the actual
disk, but I have used at least one app that really hated the "start IO
now" approach. I can't talk about that app in any detail, but I can say
that it was an in-memory checkpoint thing with the checkpoints easily
being in the hundred-meg range.
And moving a hundred megs to the IO layer is insane. It also makes the
system pretty unusable.
So we may have different expectations, because we've seen different
patterns. Me, I've seen the "events are huge, and you stagger them", so
that the previous event has time to flow out to disk while you generate
the next one. There, MS_ASYNC starting IO is _wrong_, because the scale of
the event is just huge, so trying to push it through the IO subsystem asap
just makes everything suck.
In contrast, you seem to be coming at it from a standpoint of "only one
event ever outstanding at any particular time, and it's either small or
it's the only thing the whole system is doing". In which case pushing it
out to IO buffers is probably the right thing to do.
The reason I like the current MS_ASYNC is that it _allows_ both. Once you
push it to the page cache, you can choose to push it closer to the IO path
if you want to. In contrast, if MS_ASYNC pushes it directly into the IO
queues, you're screwed. You can't take it back. You don't have any choice.
Linus
Linus Torvalds wrote:
>
> On Sat, 11 Feb 2006, Nick Piggin wrote:
>
>>What do you mean by overlapping?
>
>
> I'm just talking about the "same area gets re-dirtied while it's already
> busy being written". Depending on the _program_, this:
OK, that's what I thought.
> - So let's say that you append to the log, and choose (for some reason,
> never mind) to let the kernel know. So you effectively do something
> like
>
> memcpy(logptr, newentry, newentrysize);
> logptr = logptr + newentrysize;
> if (time_to_msync) {
> msync(msyncptr, logptr - msyncptr, MS_ASYNC);
> msyncptr = logptr;
> }
>
> Ok?
>
> Now, the question is, what do we want to happen at the MS_ASYNC.
>
Well it is all very well to just make up this case but I don't
see what it proves (eg. some application may actually work better
if creat is implemented with unlink to be extreme). But I'll try
to humour you :)
Being a logging program it would appear to have some data integrity
requirements, and as such I would guess it is going to use MS_SYNC
in the very near future before writing another entry to the log
(in case a crash happens while generating the next entry).
However the fact the MS_ASYNC is even called in the first place
indicates to me that there must be some window before the MS_SYNC
point (for whatever reason). So I would really want MS_ASYNC to
actually send the page to backing store asap in order to get some
pipelining going.
> In particular, what happens if the _previous_ MS_ASYNC had started the IO
> (either directly, like in your world, or by bdflush just picking it up,
> it really doesn't matter) on the _previous_ old end of the log area, so
> the partial page at the old "msyncptr" point may actually be under IO
> still.
>
I wouldn't expect any IO there at all because there would be no
"dangling" MS_ASYNC, and there would be no random clowns writing
to our very important log. But just on the off chance that there
was some IO going on:
> We have multiple choices:
> - we ignore the issue (which is what the current behaviour for MS_ASYNC
> is, since it just marks things dirty in the page cache)
> - we mark the page dirty, but we don't start IO on it, since it's busy
> (and since it's dirty, it will _eventually_ get written out)
> - we actually wait for the old IO, in order to start IO on it again.
>
> Now, I don't think that the third option is sane for MS_ASYNC (ie I don't
> think even you want -that- behaviour), but in general, all these three
> choices are actually sane. Notice how none of them actually involve
> waiting for the new _result_. It's only a question about whether to wait
> for an old write when we start a new one, or leave the new one entirely
> _unstarted_.
>
3 is obviously wrong because it blocks. 1 is what we have now which
I'm arguing against (on efficiency grounds). So that leaves us with 2,
which is an acceptable compromise for a situation which isn't likely
to come up much with a well coded app.
>
>>fadvise(fd, 100, 200, FADV_ASYNC_WRITE);
>>fadvise(fd, 300, 400, FADV_ASYNC_WRITE);
>>fadvise(fd, 100, 200, FADV_WRITE_WAIT);
>>fadvise(fd, 300, 400, FADV_WRITE_WAIT);
>
>
> I'm saying that a valid pattern is
>
> .. dirty offset 100-200 ..
> fadvice(fd, 100, 200, FADV_WRITE_START_TRY);
>
> .. dirty offset 200-300 ..
> fadvice(fd, 200, 300, FADV_WRITE_START_TRY);
>
> .. dirty offset 300-400 ..
> fadvice(fd, 300, 400, FADV_WRITE_START_TRY);
>
> is a valid thing to do ("try to start IO, but don't guarantee it") as a
> way to get things going. But that would never pair up with a "wait for
> IO", because there's no guarantee that the IO got started (for example, we
> may have started the IO when only bytes 100-200 were dirty, then we
> dirtied the other bytes, but we didn't re-start the IO for them because
> the previous IO to the same page was still pending, so the bytes never hit
> storage and they aren't even outstanding).
>
> But the "FADV_WRITE_START_TRY" is actually the best thing if what you are
> trying to do is to keep changes _minimal_ so that when you later actually
> finish the whole thing, you can do
>
> fadvice(fd, 100, 400, FADV_WRITE_WAIT);
>
> which is your "write and wait".
>
[argh! that was actually Andrew's "wait for writeout", but OK ;)]
> So far so good, and we don't actually care. The unconditional "write and
> wait" at the end means that it's irrelevant whether the "START_TRY" thing
> actually started the IO or not - the START_TRY thing _can_ be a no-op if
> you want to.
>
> These sound like the semantics you want. No?
>
Yes (and I'd likewise argue that an efficient FADV_WRITE_START_TRY
implementation should really try to get IO going. Ie. exactly what
I'm arguing for MS_ASYNC).
> And yes, I'm perfectly happy with them. I think this is what people would
> do. I just wanted to make sure that we're AWARE of the fact that it
> implies that the ASYNC thing wouldn't necessarily always even start IO.
>
True. I believe our MS_ASYNC is technically within the standards.
I think it is suboptimal for sane users and against the spirit
of the spec.
> And the reason I wanted to make sure of that is that the whole thread
> started from you complaining about MS_ASYNC not starting the IO. I'm
> saying that if you _require_ starting of IO, then the FADV_WRITE_WAIT
> actually sensibly has different semantics, which can be a lot cheaper to
> do in the presence of other writers (ie then the write-wait would only
> need to wait for any outstanding IO, not start writing out stuff that
> somebody else had written).
>
> And the reason I wanted to take up the semantic difference is because
> there _are_ semantic differences.
>
> If you only "commit" things when you have nothing dangling, you'll see the
> above patterns. But it's a valid thing to commit things after you've made
> "further" log changes (that you're _not_ ready to commit). For example,
I don't think so in general because userspace can't guarantee something
*is not* sent to backing store, only that it *is*.
> say that your log is really dirtying all the time, but you synchronize it
> at certain points and write the pointer to the synchronized state
> somewhere else. What would you do?
>
> Your pattern would actually be
>
> .. dirty offset 100-200 ..
> fadvice(fd, 100, 200, FADV_WRITE_START);
>
> .. dirty offset 200-300 ..
> fadvice(fd, 200, 300, FADV_WRITE_START);
>
> .. dirty offset 300-400 ..
> fadvice(fd, 300, 400, FADV_WRITE_START);
>
> .. dirty offset 400-415 .. (for the next transaction)
>
- IOW if the app or OS crashed here it would be possible to see 400-415 on
the disk and none of the previous transactions (assuming we don't know
the page size).
- If you are saying that the app does know the page size, then it is
obvious that it is by no stretch of the imagination hand optimising
IO, because it will have started 3 different IOs for the same page.
- Or (final option) only the first fadvise started IO, then any or all
of the subsequent transactions might not be synched after FADV_JUST_WAIT
(depending on what the DMA to disk saw).
> fadvice(fd, 100, 400, FADV_JUST_WAIT); (for the previous one)
>
> and here is where the semantics differ. The "always start IO, and just
> wait for IO" won't be waiting for the partial stuff (that doesn't matter).
> While the "write and wait" would synchronously write stuff that we just
> don't care about (just because they happen to be on the same "IO
> granularity" block).
>
> This "unconditional write start" + "unconditional wait only" pattern in
> theory allows you to optimize all the IO patterns by hand, and have less
> synchronous waits, because it wouldn't wait for state that is dirty, but
> that doesn't matter.
>
I'm not convinced. Your above example was bogus.
> But as long as people are _aware_ of this issue, I don't much care.
>
> Linus
>
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Am Freitag, 10. Februar 2006 20:05 schrieb Linus Torvalds:
> So we may have different expectations, because we've seen different
> patterns. Me, I've seen the "events are huge, and you stagger them", so
> that the previous event has time to flow out to disk while you generate
> the next one. There, MS_ASYNC starting IO is _wrong_, because the scale of
> the event is just huge, so trying to push it through the IO subsystem asap
> just makes everything suck.
Isn't the benefit of starting writing immediately greater the smaller
the area in question? If so, couldn't a heuristic be found to decide whether
to initiate IO at once?
Oliver
On Sat, 11 Feb 2006, Nick Piggin wrote:
> >
> > Your pattern would actually be
> >
> > .. dirty offset 100-200 ..
> > fadvice(fd, 100, 200, FADV_WRITE_START);
> >
> > .. dirty offset 200-300 ..
> > fadvice(fd, 200, 300, FADV_WRITE_START);
> >
> > .. dirty offset 300-400 ..
> > fadvice(fd, 300, 400, FADV_WRITE_START);
> >
> > .. dirty offset 400-415 .. (for the next transaction)
> >
>
> - IOW if the app or OS crashed here it would be possible to see 400-415 on
> the disk and none of the previous transactions (assuming we don't know
> the page size).
If the app/OS crashed here, nothing would matter. We haven't committed
anything at all yet. We've just started the IO. What is at 400-415 simply
doesn't matter, because nobody would have any reason to look at it.
(Besides, it's not at all clear that 400-415 would or would not be on
disk. It depends entirely on timing and buffering of the IO system at
that point - the fact that it's dirty in memory doesn't mean that it ever
made it into the IO buffer that was started).
> > fadvice(fd, 100, 400, FADV_JUST_WAIT); (for the previous one)
This is the one that waits for it to finish, so _now_ we can update the
pointers (elsewhere) to that log (and if the app/OS crashes before that,
nobody will even know about it).
See?
> I'm not convinced. Your above example was bogus.
No, your understanding was incomplete. I'm talking about just parts of a
much bigger transaction.
A single write on its own is almost never a transaction unless your system
is _purely_ log-based (which it could be, of course. Not in my example).
Linus
Linus Torvalds wrote:
>
> On Sat, 11 Feb 2006, Nick Piggin wrote:
>
>>>Your pattern would actually be
>>>
>>> .. dirty offset 100-200 ..
>>> fadvice(fd, 100, 200, FADV_WRITE_START);
>>>
>>> .. dirty offset 200-300 ..
>>> fadvice(fd, 200, 300, FADV_WRITE_START);
>>>
>>> .. dirty offset 300-400 ..
>>> fadvice(fd, 300, 400, FADV_WRITE_START);
>>>
>>> .. dirty offset 400-415 .. (for the next transaction)
>>>
>>
>>- IOW if the app or OS crashed here it would be possible to see 400-415 on
>>the disk and none of the previous transactions (assuming we don't know
>>the page size).
>
>
> If the app/OS crashed here, nothing would matter. We haven't committed
> anything at all yet. We've just started the IO. What is at 400-415 simply
> doesn't matter, because nobody would have any reason to look at it.
>
> (Besides, it's not at all clear that 400-415 would or would not be on
> disk. It depends entirely on timing and buffering of the IO system at
> that point - the fact that it's dirty in memory doesn't mean that it ever
> made it into the IO buffer that was started).
>
>
>>> fadvice(fd, 100, 400, FADV_JUST_WAIT); (for the previous one)
>
>
> This is the one that waits for it to finish, so _now_ we can update the
> pointers (elsewhere) to that log (and if the app/OS crashes before that,
> nobody will even know about it).
>
> See?
>
Well in that case in your argument your FADV_WRITE_START is of
the "waits for writeout then starts writeout if dirty" type.
In which case you've just made 3 consecutive write+wait cycles
to the same page, so it is hardly an optimal IO pattern.
>
>>I'm not convinced. Your above example was bogus.
>
>
> No, your understanding was incomplete. I'm talking about just parts of a
> much bigger transaction.
>
> A single write on its own is almost never a transaction unless your system
> is _purely_ log-based (which it could be, of course. Not in my example).
>
You were saying that your above sequence would be more efficient
if implemented with "always start IO, and just wait for IO", because
"write and wait" would do 2 write+wait cycles.
However "always start IO, and just wait for IO" does 3 write+wait cycles.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Fri, 10 Feb 2006, Oliver Neukum wrote:
>
> Am Freitag, 10. Februar 2006 20:05 schrieb Linus Torvalds:
> > So we may have different expectations, because we've seen different
> > patterns. Me, I've seen the "events are huge, and you stagger them", so
> > that the previous event has time to flow out to disk while you generate
> > the next one. There, MS_ASYNC starting IO is _wrong_, because the scale of
> > the event is just huge, so trying to push it through the IO subsystem asap
> > just makes everything suck.
>
> Isn't the benefit of starting writing immediately greater the smaller
> the area in question? If so, couldn't a heuristic be found to decide whether
> to initiate IO at once?
Quite possibly. I suspect you could/should take other issues into account
too (like whether the queue to the device is busy or bdflush is already
working).
I wouldn't object to that.
Linus
On Sat, 11 Feb 2006, Nick Piggin wrote:
>
> Well in that case in your argument your FADV_WRITE_START is of
> the "waits for writeout then starts writeout if dirty" type.
>
> In which case you've just made 3 consecutive write+wait cycles
> to the same page, so it is hardly an optimal IO pattern.
The point is, this is the interface that an app would want to use if they
want _perfect_ IO patterns.
Obviously, such an app wouldn't do writes every 100 bytes (or would do
them only if it knows that enough time has passed that the previous IO
will be done - but it can't _risk_ dropping an IO if something strange
happens).
The point being the ".. it might have dirtied the page since its last
WRITE_START" thing. That's where it can very validly basically say "ok, I
now need for my last write to have finished, but I don't care about the
fact that I've made other changes _since_ in that same page". See?
Linus
Linus Torvalds wrote:
>
> On Sat, 11 Feb 2006, Nick Piggin wrote:
>
>> When MS_ASYNC is specified, msync() shall return immediately once all
>> the write operations are initiated or queued for servicing;
>>
>>It is talking about write operations, not dirtying. Actually the only
>>difference with MS_SYNC is that it waits for said write operations (of the
>>type queued up by MS_ASYNC) to complete.
>
>
> Right. And it's what we do. We queue them by moving the pages to the dirty
> lists (yeah, it's just a tag on the page index thing, whatever).
>
> And yes, you argue that we should move the queue closer to the actual
> disk, but I have used at least one app that really hated the "start IO
> now" approach. I can't talk about that app in any detail, but I can say
> that it was an in-memory checkpoint thing with the checkpoints easily
> being in the hundred-meg range.
>
Hey fix your damn broken proprietary app (nah just kidding)
> And moving a hundred megs to the IO layer is insane. It also makes the
> system pretty unusable.
>
> So we may have different expectations, because we've seen different
> patterns. Me, I've seen the "events are huge, and you stagger them", so
> that the previous event has time to flow out to disk while you generate
> the next one. There, MS_ASYNC starting IO is _wrong_, because the scale of
> the event is just huge, so trying to push it through the IO subsystem asap
> just makes everything suck.
>
> In contrast, you seem to be coming at it from a standpoint of "only one
> event ever outstanding at any particular time, and it's either small or
> it's the only thing the whole system is doing". In which case pushing it
> out to IO buffers is probably the right thing to do.
>
The way I see it, it stems from simply a different expectation of
MS_ASYNC semantics, rather than exactly what the app is doing.
If there are no data integrity requirements, then the writing should
be left up to the VM. If there are, then there will be a MS_SYNC,
which *will* move those hundred megs to the IO layer so there is no
reason for MS_ASYNC *not* to get it started earlier (and it will
be more efficient if it does).
The semantics your app wants, in my interpretation, are provided
by MS_INVALIDATE. Which kind of says "bring mmap data into coherence
with system cache", which would presumably transfer dirty bits if
modified (though as an implementation detail, we are never actually
incoherent as far as the data goes, only dirty bits).
At this point the best I can do is agree to disagree if you are
still not convinced and I'll leave it to Linux to keep debating it.
We reached something of an agreement on the fadvise thing at least.
Thanks,
Nick
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
Linus Torvalds <[email protected]> wrote:
>
>
>
> On Fri, 10 Feb 2006, Oliver Neukum wrote:
> >
> > Am Freitag, 10. Februar 2006 20:05 schrieb Linus Torvalds:
> > > So we may have different expectations, because we've seen different
> > > patterns. Me, I've seen the "events are huge, and you stagger them", so
> > > that the previous event has time to flow out to disk while you generate
> > > the next one. There, MS_ASYNC starting IO is _wrong_, because the scale of
> > > the event is just huge, so trying to push it through the IO subsystem asap
> > > just makes everything suck.
> >
> > Isn't the benefit of starting writing immediately greater the smaller
> > the area in question? If so, couldn't a heuristic be found to decide whether
> > to initiate IO at once?
>
> Quite possibly. I suspect you could/should take other issues into account
> too (like whether the queue to the device is busy or bdflush is already
> working).
>
Yes, it would make sense to run balance_dirty_pages_ratelimited() inside
msync_pte_range(). So pdflush will get poked if we hit
background_dirty_ratio threshold, or we go into caller-initiated writeout
if we hit dirty_ratio.
But it's not completely trivial, because I don't think we want to be doing
blocking writeback with mmap_sem held.
The code under balance_dirty_pages() does pay attention to queue congestion
states, already-under-writeback pages and such things, but it could be
better, I guess. Starting some writeback earlier if the queue is deemed to
be idle could work.
(Hi, Stephen)
On Sat, 11 Feb 2006, Nick Piggin wrote:
>
> The way I see it, it stems from simply a different expectation of
> MS_ASYNC semantics, rather than exactly what the app is doing.
>
> If there are no data integrity requirements, then the writing should
> be left up to the VM. If there are, then there will be a MS_SYNC,
> which *will* move those hundred megs to the IO layer so there is no
> reason for MS_ASYNC *not* to get it started earlier (and it will
> be more efficient if it does).
Yes, largely.
> The semantics your app wants, in my interpretation, are provided
> by MS_INVALIDATE. Which kind of says "bring mmap data into coherence
> with system cache", which would presumably transfer dirty bits if
> modified (though as an implementation detail, we are never actually
> incoherent as far as the data goes, only dirty bits).
The historical meaning, as far as I can tell, is that MS_INVALIDATE really
_forgets_ the old mmap'ped contents in a non-coherent system.
Quoting from a UNIX man-page (as found by google):
...
If flags is MS_INVALIDATE, the function synchronizes the
contents of the memory region to match the current file
contents.
o All writes to the mapped portion of the file made
prior to the call are visible by subsequent read
references to the mapped memory region.
o All write references prior to the call, by any pro-
cess, to memory regions mapped to the same portion of
the file using MAP_SHARED, are visible by read refer-
ences to the region.
...
now, it's confusing, but I read that as meaning that the mmap'ed region is
literally thrown away, and that anybody who has done a "write()" call will
have their recently written data show up. That's also what the naming
("invalidate") suggests.
In a non-coherent system (and remember, that's what old UNIX was, when
MS_INVALIDATE came to be), you -cannot- reasonably synchronize your caches
any other way than by throwing away your own cached copy.
(Think non-coherent CPU caches in the old non-coherent NUMA machines that
happily nobody makes any more - same exact deal. The cache ops are either
"writeback" or "throw away" or a combination of the two.)
So I don't think MS_INVALIDATE has ever really meant what you say it
means: it certainly hasn't meant it in Linux, and it cannot really have
meant it in old UNIX either because the kind of op that you imply of a
two-way coherency simply wasn't _possible_ in original unix..
Now, the "msync(0)" case _could_ very sanely mean "just synchronize with
the page cache".
Linus
On Fri, 10 Feb 2006, Andrew Morton wrote:
>
> Yes, it would make sense to run balance_dirty_pages_ratelimited() inside
> msync_pte_range(). So pdflush will get poked if we hit
> background_dirty_ratio threshold, or we go into caller-initiated writeout
> if we hit dirty_ratio.
>
> But it's not completely trivial, because I don't think we want to be doing
> blocking writeback with mmap_sem held.
Why not just do it once, at the end?
Linus
Linus Torvalds <[email protected]> wrote:
>
>
>
> On Fri, 10 Feb 2006, Andrew Morton wrote:
> >
> > Yes, it would make sense to run balance_dirty_pages_ratelimited() inside
> > msync_pte_range(). So pdflush will get poked if we hit
> > background_dirty_ratio threshold, or we go into caller-initiated writeout
> > if we hit dirty_ratio.
> >
> > But it's not completely trivial, because I don't think we want to be doing
> > blocking writeback with mmap_sem held.
>
> Why not just do it once, at the end?
>
We could, sort-of.
balance_dirty_pages() is quite CPU-intensive (hence the presence of
balance_dirty_pages_ratelimited()).
balance_dirty_pages_ratelimited() expects to be called once per
page-dirtying.
- We can't use balance_dirty_pages() because workloads which do lots of
teeny msyncs would chew lots of CPU.
- We can't use balance_dirty_pages_ratelimited() because it thinks only a
single page was dirtied.
So the thing to do is to change msync to keep track of how many pages were
dirtied, then at the end call
balance_dirty_pages_ratelimited_new_improved_api(mapping, nr_pages_dirtied).
Except an msync can cover multiple mappings, so we'd need to pop the lock
in the top-level loop, run the above for each VMA. Not rocket-science, I
guess.
On Fri, 2006-02-10 at 13:10 -0800, Linus Torvalds wrote:
> This historical meaning as far as I can tell, for MS_INVALIDATE really
> _forgets_ the old mmap'ped contents in a non-coherent system.
>
> Quoting from a UNIX man-page (as found by google):
>
> ...
>
> If flags is MS_INVALIDATE, the function synchronizes the
> contents of the memory region to match the current file
> contents.
>
> o All writes to the mapped portion of the file made
> prior to the call are visible by subsequent read
> references to the mapped memory region.
>
> o All write references prior to the call, by any pro-
> cess, to memory regions mapped to the same portion of
> the file using MAP_SHARED, are visible by read refer-
> ences to the region.
The Single Unix Spec appears to have a very different interpretation.
See http://www.opengroup.org/onlinepubs/009695399/toc.htm
When MS_ASYNC is specified, msync() shall return immediately
once all the write operations are initiated or queued for
servicing; when MS_SYNC is specified, msync() shall not return
until all write operations are completed as defined for
synchronized I/O data integrity completion. Either MS_ASYNC or
MS_SYNC is specified, but not both.
When MS_INVALIDATE is specified, msync() shall invalidate all
cached copies of mapped data that are inconsistent with the
permanent storage locations such that subsequent references
shall obtain data that was consistent with the permanent storage
locations sometime between the call to msync() and the first
subsequent memory reference to the data.
If msync() causes any write to a file, the file's st_ctime and
st_mtime fields shall be marked for update.
Cheers,
Trond
On Fri, 10 Feb 2006, Trond Myklebust wrote:
>
> The Single Unix Spec appears to have a very different interpretation.
Hmm. Very different wording, but same meaning, I think.
> When MS_INVALIDATE is specified, msync() shall invalidate all
> cached copies of mapped data that are inconsistent with the
> permanent storage locations such that subsequent references
> shall obtain data that was consistent with the permanent storage
> locations sometime between the call to msync() and the first
> subsequent memory reference to the data.
Again, this says that the _mapping_ is invalidated, and should match
persistent storage.
Any dirty bits in the mapping (ie anything that hasn't been msync'ed)
should be made persistent with permanent storage. Again, that is entirely
consistent with just throwing the mmap'ed page away (dirty state and all)
in a non-coherent environment.
I don't think we really have any modern Unixes with non-coherent mmap's
(although HP-UX used to be that way for a _loong_ time). But in the
timeframe that was written, it was probably still an issue.
Now, in a _coherent_ environment (like Linux) it should probably be a
no-op, since the mapping is always consistent with storage (where
"storage" doesn't actually mean "disk", but the virtual file underneath
the mapping).
If the page is dirty in the page tables, we've modified the page contents
in the backing store (since we share it). But it would be consistent with
the standard wrt MS_INVALIDATE (but totally insane) to throw the dirty
state - and the page cache page - away if the page cache is clean.
The point being that a truly portable app can't really know what the hell
it does - it has to know whether mmap is coherent or not, and if mmap is
coherent, then MS_INVALIDATE should _probably_ be a no-op.
(Which it is under modern Linux - MS_INVALIDATE is effectively a no-op,
except we still have the old check that you can't invalidate a locked
area. It _used_ to actually clear the page tables)
Linus
On Fri, 2006-02-10 at 14:46 -0800, Linus Torvalds wrote:
>
> On Fri, 10 Feb 2006, Trond Myklebust wrote:
> >
> > The Single Unix Spec appears to have a very different interpretation.
>
> Hmm. Very different wording, but same meaning, I think.
>
> > When MS_INVALIDATE is specified, msync() shall invalidate all
> > cached copies of mapped data that are inconsistent with the
> > permanent storage locations such that subsequent references
> > shall obtain data that was consistent with the permanent storage
> > locations sometime between the call to msync() and the first
> > subsequent memory reference to the data.
>
> Again, this says that the _mapping_ is invalidated, and should match
> persistent storage.
>
> Any dirty bits in the mapping (ie anything that hasn't been msync'ed)
> should be made persistent with permanent storage. Again, that is entirely
> consistent with just throwing the mmap'ed page away (dirty state and all)
> in a non-coherent environment.
>
> I don't think we really have any modern Unixes with non-coherent mmap's
> (although HP-UX used to be that way for a _loong_ time). But in the
> timeframe that was written, it was probably still an issue.
>
> Now, in a _coherent_ environment (like Linux) it should probably be a
> no-op, since the mapping is always consistent with storage (where
> "storage" doesn't actually mean "disk", but the virtual file underneath
> the mapping).
Hmmm.... When talking about syncing to _permanent_ storage one usually
is talking about what is actually on the disk. In any case, we do have
non-coherent mmapped environments in Linux (need I mention NFS,
CIFS, ... ;-)?).
IIRC msync(MS_INVALIDATE) on Solaris was actually often used by some
applications to resync the client page cache to the server when using
odd locking schemes, so I believe this interpretation is a valid one.
Cheers,
Trond
On Fri, 10 Feb 2006, Trond Myklebust wrote:
> >
> > Now, in a _coherent_ environment (like Linux) it should probably be a
> > no-op, since the mapping is always consistent with storage (where
> > "storage" doesn't actually mean "disk", but the virtual file underneath
> > the mapping).
>
> Hmmm.... When talking about syncing to _permanent_ storage one usually
> is talking about what is actually on the disk.
Ok, in that case Linux has never done what MS_INVALIDATE says, and I doubt
anybody else has either. It's neither sane nor even really doable (you'd
have to yank out everybody _elses_ caches too, not just your own).
So I think that within this context, the "permanent storage" really means
the "file" that is mapped, and doesn't care about whether it has actually
hit the disk yet.
> In any case, we do have non-coherent mmapped environments in Linux (need
> I mention NFS, CIFS, ... ;-)?).
Good point. That's an argument for actually dropping the local page cache
entirely on such a filesystem, since such a filesystem really isn't
fundamentally coherent.
However, that would be some really _nasty_ semantics, because it would
mean that something like NFS would behave very fundamentally differently
than a local filesystem, even if the user only actually uses it on the
local machine and there are no other writers ("..but there _could_ be
other writers that we don't know about").
So I'd have to veto that just on the grounds of trying to keep users sane.
> IIRC msync(MS_INVALIDATE) on Solaris was actually often used by some
> applications to resync the client page cache to the server when using
> odd locking schemes, so I believe this interpretation is a valid one.
I think you're right. Although I would also guess that 99% of the time,
you'd only do that for read-only mappings. Doing the same in the presence
of also doing writes is just asking for getting shot.
Even for read-only mappings, it's actually quite hard to globally flush a
page cache page if somebody else happens to be using it for a read() or
something at exactly the same time.
Linus
Linus Torvalds wrote:
>
> On Sat, 11 Feb 2006, Nick Piggin wrote:
>
>>Well in that case in your argument your FADV_WRITE_START is of
>>the "waits for writeout then starts writeout if dirty" type.
>>
>>In which case you've just made 3 consecutive write+wait cycles
>>to the same page, so it is hardly an optimal IO pattern.
>
>
> The point is, this is the interface that an app would want to use if they
> want _perfect_ IO patterns.
>
I'll be annoying and take you up on this again.
It is possible that my FADV_WRITE_SYNC will do an extra write of a page
if it has since become dirty again, however that would seem to be rare
for such a thing to happen (ie. because the app has asked for some previous
copy of data to be on disk).
I'm not saying it would never happen, your sub-page sized example is one
probably valid case - however in that case Andrew's wait-for-write doesn't
always do the right thing either.
But I will grant that start-writeout + wait-for-write must be at least as
expressive as write-and-wait.
*however*, it still isn't perfect and it still does things worse than my
proposal. For example, if the kernel itself actually decides to start writeout
before you call fadvise(FADV_START_WRITEOUT) then it is now going to block
and wait for the io to finish.
Anyway if we agree they are both much of a muchness, then I hope we can
go for FADV_WRITE_ASYNC, FADV_WRITE_SYNC because it is consistent with the
rest of the userspace API we expose.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
On Fri, 2006-02-10 at 15:15 -0800, Linus Torvalds wrote:
> > IIRC msync(MS_INVALIDATE) on Solaris was actually often used by some
> > applications to resync the client page cache to the server when using
> > odd locking schemes, so I believe this interpretation is a valid one.
>
> I think you're right. Although I would also guess that 99% of the time,
> you'd only do that for read-only mappings. Doing the same in the presence
> of also doing writes is just asking for getting shot.
>
> Even for read-only mappings, it's actually quite hard to globally flush a
> page cache page if somebody else happens to be using it for a read() or
> something at exactly the same time.
I'm thinking specifically of the case where the application is using
some fancy user space locking scheme of its own to guarantee safe read
access to a part of the file that is known to have changed on the
server.
We do have fadvise(POSIX_FADV_DONTNEED), which gets you most of the way.
However that calls invalidate_mapping_pages(), which only clears
unlocked pages. This again means that kernel activities like readahead
or VM scanning can cause pages the user would otherwise like to eject to
be preserved.
The other alternative is to use O_DIRECT file access, but that forces
the application to handle caching and readahead in user space too.
Cheers,
Trond