LinuxLists.cc - [PATCH -v8 0/4] Fixing the issue with memory-mapped file times

[permalink] [raw]

Subject: [PATCH -v8 2/4] Update ctime and mtime for memory-mapped files

Update ctime and mtime for memory-mapped files at a write access on
a present, read-only PTE, as well as at a write on a non-present PTE.

Signed-off-by: Anton Salikhmetov <[email protected]>
---
mm/memory.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 6dd1cd8..4b0144b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1670,6 +1670,9 @@ gotten:
unlock:
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
/*
* Yes, Virginia, this is actually required to prevent a race
* with clear_page_dirty_for_io() from clearing the page dirty
@@ -2343,6 +2346,9 @@ out_unlocked:
if (anon)
page_cache_release(vmf.page);
else if (dirty_page) {
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
set_page_dirty_balance(dirty_page, page_mkwrite);
put_page(dirty_page);
}
--
1.4.4.4

2008-01-22 23:22:04

[permalink] [raw]

Subject: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

Force file times update at the next write reference after
calling the msync() system call with the MS_ASYNC flag.

Signed-off-by: Anton Salikhmetov <[email protected]>
---
mm/msync.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 82 insertions(+), 10 deletions(-)

diff --git a/mm/msync.c b/mm/msync.c
index 60efa36..87f990e 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -5,6 +5,7 @@
* Copyright (C) 2008 Anton Salikhmetov <[email protected]>
*/

+#include <asm/tlbflush.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -12,6 +13,73 @@
#include <linux/sched.h>
#include <linux/syscalls.h>

+static void vma_wrprotect_pmd_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long start, unsigned long end)
+{
+ while (start < end) {
+ spinlock_t *ptl;
+ pte_t *pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+
+ if (pte_dirty(*pte) && pte_write(*pte)) {
+ pte_t entry = ptep_clear_flush(vma, start, pte);
+
+ entry = pte_wrprotect(entry);
+ set_pte_at(vma->vm_mm, start, pte, entry);
+ }
+
+ pte_unmap_unlock(pte, ptl);
+ start += PAGE_SIZE;
+ }
+}
+
+static void vma_wrprotect_pud_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long start, unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, start);
+
+ while (start < end) {
+ unsigned long next = pmd_addr_end(start, end);
+
+ if (!pmd_none_or_clear_bad(pmd))
+ vma_wrprotect_pmd_range(vma, pmd, start, next);
+
+ ++pmd;
+ start = next;
+ }
+}
+
+static void vma_wrprotect_pgd_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long start, unsigned long end)
+{
+ pud_t *pud = pud_offset(pgd, start);
+
+ while (start < end) {
+ unsigned long next = pud_addr_end(start, end);
+
+ if (!pud_none_or_clear_bad(pud))
+ vma_wrprotect_pud_range(vma, pud, start, next);
+
+ ++pud;
+ start = next;
+ }
+}
+
+static void vma_wrprotect(struct vm_area_struct *vma)
+{
+ unsigned long addr = vma->vm_start;
+ pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
+
+ while (addr < vma->vm_end) {
+ unsigned long next = pgd_addr_end(addr, vma->vm_end);
+
+ if (!pgd_none_or_clear_bad(pgd))
+ vma_wrprotect_pgd_range(vma, pgd, addr, next);
+
+ ++pgd;
+ addr = next;
+ }
+}
+
/*
* MS_SYNC syncs the entire file - including mappings.
*
@@ -78,16 +146,20 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
error = 0;
start = vma->vm_end;
file = vma->vm_file;
- if (file && (vma->vm_flags & VM_SHARED) && (flags & MS_SYNC)) {
- get_file(file);
- up_read(&mm->mmap_sem);
- error = do_fsync(file, 0);
- fput(file);
- if (error || start >= end)
- goto out;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, start);
- continue;
+ if (file && (vma->vm_flags & VM_SHARED)) {
+ if ((flags & MS_ASYNC))
+ vma_wrprotect(vma);
+ if (flags & MS_SYNC) {
+ get_file(file);
+ up_read(&mm->mmap_sem);
+ error = do_fsync(file, 0);
+ fput(file);
+ if (error || start >= end)
+ goto out;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ continue;
+ }
}

vma = vma->vm_next;
--
1.4.4.4

2008-01-22 23:22:25

[permalink] [raw]

Subject: [PATCH -v8 1/4] Massive code cleanup of sys_msync()

Use the PAGE_ALIGN() macro instead of "manual" alignment.
Improve readability of the loop, which traverses the process
memory regions. Make code more symmetric and possibly boost
performance on some RISC CPUs by moving variable assignments.

Signed-off-by: Anton Salikhmetov <[email protected]>
---
mm/msync.c | 76 ++++++++++++++++++++++++++++-------------------------------
1 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/mm/msync.c b/mm/msync.c
index 144a757..60efa36 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -1,85 +1,84 @@
/*
- * linux/mm/msync.c
+ * The msync() system call.
*
- * Copyright (C) 1994-1999 Linus Torvalds
+ * Copyright (C) 1994-1999 Linus Torvalds
+ * Copyright (C) 2008 Anton Salikhmetov <[email protected]>
*/

-/*
- * The msync() system call.
- */
+#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/syscalls.h>
#include <linux/sched.h>
+#include <linux/syscalls.h>

/*
* MS_SYNC syncs the entire file - including mappings.
*
* MS_ASYNC does not start I/O (it used to, up to 2.5.67).
- * Nor does it marks the relevant pages dirty (it used to up to 2.6.17).
+ * Nor does it mark the relevant pages dirty (it used to up to 2.6.17).
* Now it doesn't do anything, since dirty pages are properly tracked.
*
- * The application may now run fsync() to
- * write out the dirty pages and wait on the writeout and check the result.
- * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
- * async writeout immediately.
+ * The application may now run fsync() to write out the dirty pages and
+ * wait on the writeout and check the result. Or the application may run
+ * fadvise(FADV_DONTNEED) against the fd to start async writeout immediately.
* So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
* applications.
*/
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
unsigned long end;
- struct mm_struct *mm = current->mm;
+ int error, unmapped_error;
struct vm_area_struct *vma;
- int unmapped_error = 0;
- int error = -EINVAL;
+ struct mm_struct *mm;

+ error = -EINVAL;
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out;
if (start & ~PAGE_MASK)
goto out;
if ((flags & MS_ASYNC) && (flags & MS_SYNC))
goto out;
+
error = -ENOMEM;
- len = (len + ~PAGE_MASK) & PAGE_MASK;
+ len = PAGE_ALIGN(len);
end = start + len;
if (end < start)
goto out;
+
error = 0;
+ unmapped_error = 0;
if (end == start)
goto out;
+
/*
* If the interval [start,end) covers some unmapped address ranges,
* just ignore them, but return -ENOMEM at the end.
*/
+ mm = current->mm;
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
- for (;;) {
+ do {
struct file *file;

- /* Still start < end. */
error = -ENOMEM;
if (!vma)
- goto out_unlock;
- /* Here start < vma->vm_end. */
+ break;
if (start < vma->vm_start) {
start = vma->vm_start;
if (start >= end)
- goto out_unlock;
- unmapped_error = -ENOMEM;
- }
- /* Here vma->vm_start <= start < vma->vm_end. */
- if ((flags & MS_INVALIDATE) &&
- (vma->vm_flags & VM_LOCKED)) {
- error = -EBUSY;
- goto out_unlock;
+ break;
+ unmapped_error = error;
}
- file = vma->vm_file;
+
+ error = -EBUSY;
+ if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
+ break;
+
+ error = 0;
start = vma->vm_end;
- if ((flags & MS_SYNC) && file &&
- (vma->vm_flags & VM_SHARED)) {
+ file = vma->vm_file;
+ if (file && (vma->vm_flags & VM_SHARED) && (flags & MS_SYNC)) {
get_file(file);
up_read(&mm->mmap_sem);
error = do_fsync(file, 0);
@@ -88,16 +87,13 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
goto out;
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
- } else {
- if (start >= end) {
- error = 0;
- goto out_unlock;
- }
- vma = vma->vm_next;
+ continue;
}
- }
-out_unlock:
+
+ vma = vma->vm_next;
+ } while (start < end);
up_read(&mm->mmap_sem);
+
out:
- return error ? : unmapped_error;
+ return error ? error : unmapped_error;
}
--
1.4.4.4

2008-01-22 23:22:41

[permalink] [raw]

Subject: [PATCH -v8 4/4] The design document for memory-mapped file times update

Add a document, which describes how the POSIX requirements on updating
memory-mapped file times are addressed in Linux.

Signed-off-by: Anton Salikhmetov <[email protected]>
---
Documentation/vm/00-INDEX | 2 +
Documentation/vm/msync.txt | 117 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 119 insertions(+), 0 deletions(-)

diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 2131b00..2726c8d 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -6,6 +6,8 @@ hugetlbpage.txt
- a brief summary of hugetlbpage support in the Linux kernel.
locking
- info on how locking and synchronization is done in the Linux vm code.
+msync.txt
+ - the design document for memory-mapped file times update
numa
- information about NUMA specific code in the Linux vm.
numa_memory_policy.txt
diff --git a/Documentation/vm/msync.txt b/Documentation/vm/msync.txt
new file mode 100644
index 0000000..571a766
--- /dev/null
+++ b/Documentation/vm/msync.txt
@@ -0,0 +1,117 @@
+
+ The msync() system call and memory-mapped file times
+
+ Copyright (C) 2008 Anton Salikhmetov
+
+The POSIX standard requires that any write reference to memory-mapped file
+data should result in updating the ctime and mtime for that file. Moreover,
+the standard mandates that updated file times should become visible to the
+world no later than at the next call to msync().
+
+Failure to meet this requirement creates difficulties for certain classes
+of important applications. For instance, database backup systems fail to
+pick up the files modified via the mmap() interface. Also, this is a
+security hole, which allows forging file data in such a manner that proving
+the fact that file data was modified is not possible.
+
+Briefly put, this requirement can be stated as follows:
+
+ once the file data has changed, the operating system
+ should acknowledge this fact by updating file metadata.
+
+This document describes how this POSIX requirement is addressed in Linux.
+
+1. Requirements
+
+1.1) the POSIX standard requires updating ctime and mtime not later
+than at the call to msync() with MS_SYNC or MS_ASYNC flags;
+
+1.2) in existing POSIX implementations, ctime and mtime
+get updated not later than at the call to fsync();
+
+1.3) in existing POSIX implementation, ctime and mtime
+get updated not later than at the call to sync(), the "auto-update" feature;
+
+1.4) the customers require and the common sense suggests that
+ctime and mtime should be updated not later than at the call to munmap()
+or exit(), the latter function implying an implicit call to munmap();
+
+1.5) the (1.1) item should be satisfied if the file is a block device
+special file;
+
+1.6) the (1.1) item should be satisfied for files residing on
+memory-backed filesystems such as tmpfs, too.
+
+The following operating systems were used as the reference platforms
+and are referred to as the "existing implementations" above:
+HP-UX B.11.31 and FreeBSD 6.2-RELEASE.
+
+2. Lazy update
+
+Many attempts before the current version implemented the "lazy update" approach
+to satisfying the requirements given above. Within the latter approach, ctime
+and mtime get updated at last moment allowable.
+
+Since we don't update the file times immediately, some Flag has to be
+used. When up, this Flag means that the file data was modified and
+the file times need to be updated as soon as possible.
+
+Any existing "dirty" flag which, when up, mean that a page has been written to,
+is not suitable for this purpose. Indeed, msync() called with MS_ASYNC
+would have to reset this "dirty" flag after updating ctime and mtime.
+The sys_msync() function itself is basically a no-op in the MS_ASYNC case.
+Thereby, the synchronization routines relying upon this "dirty" flag
+would lose data. Therefore, a new Flag has to be introduced.
+
+The (1.5) item coupled with (1.3) requirement leads to hard work with
+the block device inodes. Specifically, during writeback it is impossible to
+tell which block device file was originally mapped. Therefore, we need to
+traverse the list of "active" devices associated with the block device inode.
+This would lead to updating file times for block device files, which were not
+taking part in the data transfer.
+
+Also all versions prior to version 6 failed to correctly process ctime and
+mtime for files on the memory-backed filesystems such as tmpfs. So the (1.6)
+requirement was not satisfied.
+
+If a write reference has occurred between two consecutive calls to msync()
+with MS_ASYNC, the second call to the latter function should take into
+account the last write reference. The last write reference can not be caught
+if no pagefault occurs. Hence a pagefault needs to be forced. This can be done
+using two different approaches. The first one is to synchronize data even when
+msync() was called with MS_ASYNC. This is not acceptable because the current
+design of the sys_msync() routine forbids starting I/O for the MS_ASYNC case.
+The second approach is to write protect the page for triggering a pagefault
+at the next write reference. Note that the dirty flag for the page should not
+be cleared thereby.
+
+In the "lazy update" approach, the requirements (1.1), (1.2), (1.3), and (1.4)
+taken together result in adding code at least to the following kernel routines:
+sys_msync(), do_fsync(), some routine in the unmap() call path, some routine
+in the sync() call path.
+
+Finally, a file_update_time()-like function would have to be created for
+processing the inode objects, not file objects. This is due to the fact that
+during the sync() operation, the file object may not exist any more, only
+the inode is known.
+
+To sum up: this "lazy" approach leads to massive changes, incurs overhead in
+the block device case, and requires complicated design decisions.
+
+3. Immediate update
+
+OK, still reading? There's a better way.
+
+In a fashion analogous to what happens at write(2), react to the fact
+that the page gets dirtied by updating the file times immediately.
+Thereby any page writeback happens when the write reference has already
+been accounted for from the view point of file times.
+
+The only problem which remains is to force refreshing file times at the write
+reference following a call to msync() with MS_ASYNC. As mentioned above, all
+that is needed here is to force a pagefault.
+
+The vma_wrprotect() routine introduced in this patch series is called
+from sys_msync() in the MS_ASYNC case. The former routine is essentially
+a version of existing page_mkclean_one() function from mm/rmap.c. Unlike
+the latter function, the vma_wrprotect() does not touch the dirty bit.
--
1.4.4.4

2008-01-23 08:47:42

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 2008-01-23 at 02:21 +0300, Anton Salikhmetov wrote:

> +static void vma_wrprotect_pmd_range(struct vm_area_struct *vma, pmd_t *pmd,
> + unsigned long start, unsigned long end)
> +{
> + while (start < end) {
> + spinlock_t *ptl;
> + pte_t *pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
> +
> + if (pte_dirty(*pte) && pte_write(*pte)) {
> + pte_t entry = ptep_clear_flush(vma, start, pte);
> +
> + entry = pte_wrprotect(entry);
> + set_pte_at(vma->vm_mm, start, pte, entry);
> + }
> +
> + pte_unmap_unlock(pte, ptl);
> + start += PAGE_SIZE;
> + }
> +}

You've had two examples on how to write this loop, one from git commit
204ec841fbea3e5138168edbc3a76d46747cc987, and one from my draft, but
this one looks like neither and is much less efficient. Take the lock
only once per pmd, not once per pte please.

> +static void vma_wrprotect_pud_range(struct vm_area_struct *vma, pud_t *pud,
> + unsigned long start, unsigned long end)
> +{
> + pmd_t *pmd = pmd_offset(pud, start);
> +
> + while (start < end) {
> + unsigned long next = pmd_addr_end(start, end);
> +
> + if (!pmd_none_or_clear_bad(pmd))
> + vma_wrprotect_pmd_range(vma, pmd, start, next);
> +
> + ++pmd;
> + start = next;
> + }
> +}
> +
> +static void vma_wrprotect_pgd_range(struct vm_area_struct *vma, pgd_t *pgd,
> + unsigned long start, unsigned long end)
> +{
> + pud_t *pud = pud_offset(pgd, start);
> +
> + while (start < end) {
> + unsigned long next = pud_addr_end(start, end);
> +
> + if (!pud_none_or_clear_bad(pud))
> + vma_wrprotect_pud_range(vma, pud, start, next);
> +
> + ++pud;
> + start = next;
> + }
> +}
> +
> +static void vma_wrprotect(struct vm_area_struct *vma)
> +{
> + unsigned long addr = vma->vm_start;
> + pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
> +
> + while (addr < vma->vm_end) {
> + unsigned long next = pgd_addr_end(addr, vma->vm_end);
> +
> + if (!pgd_none_or_clear_bad(pgd))
> + vma_wrprotect_pgd_range(vma, pgd, addr, next);
> +
> + ++pgd;
> + addr = next;
> + }
> +}

I think you want to pass start, end here too, you might not need to
sweep the whole vma.

2008-01-23 08:51:38

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 2008-01-23 at 09:47 +0100, Peter Zijlstra wrote:
> On Wed, 2008-01-23 at 02:21 +0300, Anton Salikhmetov wrote:

> > +static void vma_wrprotect(struct vm_area_struct *vma)
> > +{
> > + unsigned long addr = vma->vm_start;
> > + pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
> > +
> > + while (addr < vma->vm_end) {
> > + unsigned long next = pgd_addr_end(addr, vma->vm_end);
> > +
> > + if (!pgd_none_or_clear_bad(pgd))
> > + vma_wrprotect_pgd_range(vma, pgd, addr, next);
> > +
> > + ++pgd;
> > + addr = next;
> > + }
> > +}
>
> I think you want to pass start, end here too, you might not need to
> sweep the whole vma.

Also, it still doesn't make sense to me why we'd not need to walk the
rmap, it is all the same file after all.

2008-01-23 09:26:34

[permalink] [raw]

Subject: Re: [PATCH -v8 4/4] The design document for memory-mapped file times update

I think it would be more logical to move this patch forward, before
the two patches which this document is actually describing.

> Add a document, which describes how the POSIX requirements on updating
> memory-mapped file times are addressed in Linux.
>
> Signed-off-by: Anton Salikhmetov <[email protected]>
> ---
> Documentation/vm/00-INDEX | 2 +
> Documentation/vm/msync.txt | 117 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 119 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
> index 2131b00..2726c8d 100644
> --- a/Documentation/vm/00-INDEX
> +++ b/Documentation/vm/00-INDEX
> @@ -6,6 +6,8 @@ hugetlbpage.txt
> - a brief summary of hugetlbpage support in the Linux kernel.
> locking
> - info on how locking and synchronization is done in the Linux vm code.
> +msync.txt
> + - the design document for memory-mapped file times update
> numa
> - information about NUMA specific code in the Linux vm.
> numa_memory_policy.txt
> diff --git a/Documentation/vm/msync.txt b/Documentation/vm/msync.txt
> new file mode 100644
> index 0000000..571a766
> --- /dev/null
> +++ b/Documentation/vm/msync.txt
> @@ -0,0 +1,117 @@
> +
> + The msync() system call and memory-mapped file times
> +
> + Copyright (C) 2008 Anton Salikhmetov
> +
> +The POSIX standard requires that any write reference to memory-mapped file
> +data should result in updating the ctime and mtime for that file. Moreover,
> +the standard mandates that updated file times should become visible to the
> +world no later than at the next call to msync().
> +
> +Failure to meet this requirement creates difficulties for certain classes
> +of important applications. For instance, database backup systems fail to
> +pick up the files modified via the mmap() interface. Also, this is a
> +security hole, which allows forging file data in such a manner that proving
> +the fact that file data was modified is not possible.
> +
> +Briefly put, this requirement can be stated as follows:
> +
> + once the file data has changed, the operating system
> + should acknowledge this fact by updating file metadata.
> +
> +This document describes how this POSIX requirement is addressed in Linux.
> +
> +1. Requirements
> +
> +1.1) the POSIX standard requires updating ctime and mtime not later
> +than at the call to msync() with MS_SYNC or MS_ASYNC flags;
> +
> +1.2) in existing POSIX implementations, ctime and mtime
> +get updated not later than at the call to fsync();
> +
> +1.3) in existing POSIX implementation, ctime and mtime
> +get updated not later than at the call to sync(), the "auto-update" feature;
> +
> +1.4) the customers require and the common sense suggests that
> +ctime and mtime should be updated not later than at the call to munmap()
> +or exit(), the latter function implying an implicit call to munmap();
> +
> +1.5) the (1.1) item should be satisfied if the file is a block device
> +special file;
> +
> +1.6) the (1.1) item should be satisfied for files residing on
> +memory-backed filesystems such as tmpfs, too.
> +
> +The following operating systems were used as the reference platforms
> +and are referred to as the "existing implementations" above:
> +HP-UX B.11.31 and FreeBSD 6.2-RELEASE.
> +
> +2. Lazy update
> +
> +Many attempts before the current version implemented the "lazy update" approach
> +to satisfying the requirements given above. Within the latter approach, ctime
> +and mtime get updated at last moment allowable.
> +
> +Since we don't update the file times immediately, some Flag has to be
> +used. When up, this Flag means that the file data was modified and
> +the file times need to be updated as soon as possible.
> +
> +Any existing "dirty" flag which, when up, mean that a page has been written to,
> +is not suitable for this purpose. Indeed, msync() called with MS_ASYNC
> +would have to reset this "dirty" flag after updating ctime and mtime.
> +The sys_msync() function itself is basically a no-op in the MS_ASYNC case.
> +Thereby, the synchronization routines relying upon this "dirty" flag
> +would lose data. Therefore, a new Flag has to be introduced.
> +
> +The (1.5) item coupled with (1.3) requirement leads to hard work with
> +the block device inodes. Specifically, during writeback it is impossible to
> +tell which block device file was originally mapped. Therefore, we need to
> +traverse the list of "active" devices associated with the block device inode.
> +This would lead to updating file times for block device files, which were not
> +taking part in the data transfer.
> +
> +Also all versions prior to version 6 failed to correctly process ctime and
> +mtime for files on the memory-backed filesystems such as tmpfs. So the (1.6)
> +requirement was not satisfied.

Version -8 also fails: for ram backed filesystems page tables are not
write protected initially, nor after a sync. This patch does write
protect them after an MS_ASYNC, but that's a bug in the current
context.

> +
> +If a write reference has occurred between two consecutive calls to msync()
> +with MS_ASYNC, the second call to the latter function should take into
> +account the last write reference. The last write reference can not be caught
> +if no pagefault occurs. Hence a pagefault needs to be forced. This can be done
> +using two different approaches. The first one is to synchronize data even when
> +msync() was called with MS_ASYNC. This is not acceptable because the current
> +design of the sys_msync() routine forbids starting I/O for the MS_ASYNC case.

I don't think anyone forbids starting I/O, it's just too expensive,
especially if it means, waiting for previous writeback on page to
finish first.

> +The second approach is to write protect the page for triggering a pagefault
> +at the next write reference. Note that the dirty flag for the page should not
> +be cleared thereby.
> +
> +In the "lazy update" approach, the requirements (1.1), (1.2), (1.3), and (1.4)
> +taken together result in adding code at least to the following kernel routines:
> +sys_msync(), do_fsync(), some routine in the unmap() call path, some routine
> +in the sync() call path.
> +
> +Finally, a file_update_time()-like function would have to be created for
> +processing the inode objects, not file objects. This is due to the fact that
> +during the sync() operation, the file object may not exist any more, only
> +the inode is known.
> +
> +To sum up: this "lazy" approach leads to massive changes, incurs overhead in
> +the block device case, and requires complicated design decisions.
> +
> +3. Immediate update
> +
> +OK, still reading? There's a better way.
> +
> +In a fashion analogous to what happens at write(2), react to the fact
> +that the page gets dirtied by updating the file times immediately.
> +Thereby any page writeback happens when the write reference has already
> +been accounted for from the view point of file times.
> +
> +The only problem which remains is to force refreshing file times at the write
> +reference following a call to msync() with MS_ASYNC. As mentioned above, all
> +that is needed here is to force a pagefault.
> +
> +The vma_wrprotect() routine introduced in this patch series is called
> +from sys_msync() in the MS_ASYNC case. The former routine is essentially
> +a version of existing page_mkclean_one() function from mm/rmap.c. Unlike
> +the latter function, the vma_wrprotect() does not touch the dirty bit.

Benchmark results should be also added to the relevant sections, I
think. There is a very definite cost to all this, and a 10x slowdown
is usually not taken lightly...

Great document, btw :)

Miklos

2008-01-23 09:35:08

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

> On Wed, 2008-01-23 at 09:47 +0100, Peter Zijlstra wrote:
> > On Wed, 2008-01-23 at 02:21 +0300, Anton Salikhmetov wrote:
>
> > > +static void vma_wrprotect(struct vm_area_struct *vma)
> > > +{
> > > + unsigned long addr = vma->vm_start;
> > > + pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
> > > +
> > > + while (addr < vma->vm_end) {
> > > + unsigned long next = pgd_addr_end(addr, vma->vm_end);
> > > +
> > > + if (!pgd_none_or_clear_bad(pgd))
> > > + vma_wrprotect_pgd_range(vma, pgd, addr, next);
> > > +
> > > + ++pgd;
> > > + addr = next;
> > > + }
> > > +}
> >
> > I think you want to pass start, end here too, you might not need to
> > sweep the whole vma.
>
> Also, it still doesn't make sense to me why we'd not need to walk the
> rmap, it is all the same file after all.

It's the same file, but not the same memory map. It basically depends
on how you define msync:

a) sync _file_ on region defined by this mmap/start/end-address
b) sync _memory_region_ defined by start/end-address

b) is a perfectly fine definition, and it's consistent with what this
code does. The fact that POSIX probably implies a) (in a rather
poorly defined way) doesn't make much difference, I think.

Miklos

2008-01-23 09:41:18

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

> Force file times update at the next write reference after
> calling the msync() system call with the MS_ASYNC flag.
>
> Signed-off-by: Anton Salikhmetov <[email protected]>
> ---
> mm/msync.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
> 1 files changed, 82 insertions(+), 10 deletions(-)
>
> diff --git a/mm/msync.c b/mm/msync.c
> index 60efa36..87f990e 100644
> --- a/mm/msync.c
> +++ b/mm/msync.c
> @@ -5,6 +5,7 @@
> * Copyright (C) 2008 Anton Salikhmetov <[email protected]>
> */
>
> +#include <asm/tlbflush.h>
> #include <linux/file.h>
> #include <linux/fs.h>
> #include <linux/mm.h>
> @@ -12,6 +13,73 @@
> #include <linux/sched.h>
> #include <linux/syscalls.h>
>
> +static void vma_wrprotect_pmd_range(struct vm_area_struct *vma, pmd_t *pmd,
> + unsigned long start, unsigned long end)
> +{
> + while (start < end) {
> + spinlock_t *ptl;
> + pte_t *pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
> +
> + if (pte_dirty(*pte) && pte_write(*pte)) {
> + pte_t entry = ptep_clear_flush(vma, start, pte);
> +
> + entry = pte_wrprotect(entry);
> + set_pte_at(vma->vm_mm, start, pte, entry);
> + }
> +
> + pte_unmap_unlock(pte, ptl);
> + start += PAGE_SIZE;
> + }
> +}

Why can't the pte_offset_map_lock/unlock be moved outside the loop, as
in Peter's example? My guess is, it could have some impact on
performance.

> +
> +static void vma_wrprotect_pud_range(struct vm_area_struct *vma, pud_t *pud,
> + unsigned long start, unsigned long end)
> +{
> + pmd_t *pmd = pmd_offset(pud, start);
> +
> + while (start < end) {
> + unsigned long next = pmd_addr_end(start, end);
> +
> + if (!pmd_none_or_clear_bad(pmd))
> + vma_wrprotect_pmd_range(vma, pmd, start, next);
> +
> + ++pmd;
> + start = next;
> + }
> +}
> +
> +static void vma_wrprotect_pgd_range(struct vm_area_struct *vma, pgd_t *pgd,
> + unsigned long start, unsigned long end)
> +{
> + pud_t *pud = pud_offset(pgd, start);
> +
> + while (start < end) {
> + unsigned long next = pud_addr_end(start, end);
> +
> + if (!pud_none_or_clear_bad(pud))
> + vma_wrprotect_pud_range(vma, pud, start, next);
> +
> + ++pud;
> + start = next;
> + }
> +}
> +
> +static void vma_wrprotect(struct vm_area_struct *vma)
> +{
> + unsigned long addr = vma->vm_start;
> + pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
> +

Need to check mapping_cap_account_dirty(). Otherwise we would be
inconsistent with write protecting ram-backed filesystems.

> + while (addr < vma->vm_end) {
> + unsigned long next = pgd_addr_end(addr, vma->vm_end);
> +
> + if (!pgd_none_or_clear_bad(pgd))
> + vma_wrprotect_pgd_range(vma, pgd, addr, next);
> +
> + ++pgd;
> + addr = next;
> + }
> +}
> +
> /*
> * MS_SYNC syncs the entire file - including mappings.
> *
> @@ -78,16 +146,20 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
> error = 0;
> start = vma->vm_end;
> file = vma->vm_file;
> - if (file && (vma->vm_flags & VM_SHARED) && (flags & MS_SYNC)) {
> - get_file(file);
> - up_read(&mm->mmap_sem);
> - error = do_fsync(file, 0);
> - fput(file);
> - if (error || start >= end)
> - goto out;
> - down_read(&mm->mmap_sem);
> - vma = find_vma(mm, start);
> - continue;
> + if (file && (vma->vm_flags & VM_SHARED)) {
> + if ((flags & MS_ASYNC))
> + vma_wrprotect(vma);
> + if (flags & MS_SYNC) {
> + get_file(file);
> + up_read(&mm->mmap_sem);
> + error = do_fsync(file, 0);
> + fput(file);
> + if (error || start >= end)
> + goto out;
> + down_read(&mm->mmap_sem);
> + vma = find_vma(mm, start);
> + continue;
> + }
> }
>
> vma = vma->vm_next;
> --
> 1.4.4.4
>
>

2008-01-23 09:51:18

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

> > Also, it still doesn't make sense to me why we'd not need to walk the
> > rmap, it is all the same file after all.
>
> It's the same file, but not the same memory map. It basically depends
> on how you define msync:
>
> a) sync _file_ on region defined by this mmap/start/end-address
> b) sync _memory_region_ defined by start/end-address

My mmap/msync tester program can acually check this as well, with the
'-f' flag. Anton, can you try that on the reference platforms?

Miklos

2008-01-23 10:37:19

[permalink] [raw]

Subject: Re: [PATCH -v8 4/4] The design document for memory-mapped file times update

2008/1/23, Miklos Szeredi <[email protected]>:
> I think it would be more logical to move this patch forward, before
> the two patches which this document is actually describing.
>
> > Add a document, which describes how the POSIX requirements on updating
> > memory-mapped file times are addressed in Linux.
> >
> > Signed-off-by: Anton Salikhmetov <[email protected]>
> > ---
> > Documentation/vm/00-INDEX | 2 +
> > Documentation/vm/msync.txt | 117 ++++++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 119 insertions(+), 0 deletions(-)
> >
> > diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
> > index 2131b00..2726c8d 100644
> > --- a/Documentation/vm/00-INDEX
> > +++ b/Documentation/vm/00-INDEX
> > @@ -6,6 +6,8 @@ hugetlbpage.txt
> > - a brief summary of hugetlbpage support in the Linux kernel.
> > locking
> > - info on how locking and synchronization is done in the Linux vm code.
> > +msync.txt
> > + - the design document for memory-mapped file times update
> > numa
> > - information about NUMA specific code in the Linux vm.
> > numa_memory_policy.txt
> > diff --git a/Documentation/vm/msync.txt b/Documentation/vm/msync.txt
> > new file mode 100644
> > index 0000000..571a766
> > --- /dev/null
> > +++ b/Documentation/vm/msync.txt
> > @@ -0,0 +1,117 @@
> > +
> > + The msync() system call and memory-mapped file times
> > +
> > + Copyright (C) 2008 Anton Salikhmetov
> > +
> > +The POSIX standard requires that any write reference to memory-mapped file
> > +data should result in updating the ctime and mtime for that file. Moreover,
> > +the standard mandates that updated file times should become visible to the
> > +world no later than at the next call to msync().
> > +
> > +Failure to meet this requirement creates difficulties for certain classes
> > +of important applications. For instance, database backup systems fail to
> > +pick up the files modified via the mmap() interface. Also, this is a
> > +security hole, which allows forging file data in such a manner that proving
> > +the fact that file data was modified is not possible.
> > +
> > +Briefly put, this requirement can be stated as follows:
> > +
> > + once the file data has changed, the operating system
> > + should acknowledge this fact by updating file metadata.
> > +
> > +This document describes how this POSIX requirement is addressed in Linux.
> > +
> > +1. Requirements
> > +
> > +1.1) the POSIX standard requires updating ctime and mtime not later
> > +than at the call to msync() with MS_SYNC or MS_ASYNC flags;
> > +
> > +1.2) in existing POSIX implementations, ctime and mtime
> > +get updated not later than at the call to fsync();
> > +
> > +1.3) in existing POSIX implementation, ctime and mtime
> > +get updated not later than at the call to sync(), the "auto-update" feature;
> > +
> > +1.4) the customers require and the common sense suggests that
> > +ctime and mtime should be updated not later than at the call to munmap()
> > +or exit(), the latter function implying an implicit call to munmap();
> > +
> > +1.5) the (1.1) item should be satisfied if the file is a block device
> > +special file;
> > +
> > +1.6) the (1.1) item should be satisfied for files residing on
> > +memory-backed filesystems such as tmpfs, too.
> > +
> > +The following operating systems were used as the reference platforms
> > +and are referred to as the "existing implementations" above:
> > +HP-UX B.11.31 and FreeBSD 6.2-RELEASE.
> > +
> > +2. Lazy update
> > +
> > +Many attempts before the current version implemented the "lazy update" approach
> > +to satisfying the requirements given above. Within the latter approach, ctime
> > +and mtime get updated at last moment allowable.
> > +
> > +Since we don't update the file times immediately, some Flag has to be
> > +used. When up, this Flag means that the file data was modified and
> > +the file times need to be updated as soon as possible.
> > +
> > +Any existing "dirty" flag which, when up, mean that a page has been written to,
> > +is not suitable for this purpose. Indeed, msync() called with MS_ASYNC
> > +would have to reset this "dirty" flag after updating ctime and mtime.
> > +The sys_msync() function itself is basically a no-op in the MS_ASYNC case.
> > +Thereby, the synchronization routines relying upon this "dirty" flag
> > +would lose data. Therefore, a new Flag has to be introduced.
> > +
> > +The (1.5) item coupled with (1.3) requirement leads to hard work with
> > +the block device inodes. Specifically, during writeback it is impossible to
> > +tell which block device file was originally mapped. Therefore, we need to
> > +traverse the list of "active" devices associated with the block device inode.
> > +This would lead to updating file times for block device files, which were not
> > +taking part in the data transfer.
> > +
> > +Also all versions prior to version 6 failed to correctly process ctime and
> > +mtime for files on the memory-backed filesystems such as tmpfs. So the (1.6)
> > +requirement was not satisfied.
>
> Version -8 also fails: for ram backed filesystems page tables are not
> write protected initially, nor after a sync. This patch does write
> protect them after an MS_ASYNC, but that's a bug in the current
> context.

I've already written in the cover letter that functional tests passed
successfully.
In particular, this means that everything is OK with tmpfs too:

debian:~/times# ./times /mnt/file
begin 1201084493 1201084493 1201084281
write 1201084494 1201084494 1201084281
mmap 1201084494 1201084494 1201084495
b 1201084496 1201084496 1201084495
msync b 1201084496 1201084496 1201084495
c 1201084498 1201084498 1201084495
msync c 1201084498 1201084498 1201084495
d 1201084500 1201084500 1201084495
munmap 1201084500 1201084500 1201084495
close 1201084500 1201084500 1201084495
sync 1201084500 1201084500 1201084495
debian:~/times# mount | grep mnt
tmpfs on /mnt type tmpfs (rw)
debian:~/times#

>
> > +
> > +If a write reference has occurred between two consecutive calls to msync()
> > +with MS_ASYNC, the second call to the latter function should take into
> > +account the last write reference. The last write reference can not be caught
> > +if no pagefault occurs. Hence a pagefault needs to be forced. This can be done
> > +using two different approaches. The first one is to synchronize data even when
> > +msync() was called with MS_ASYNC. This is not acceptable because the current
> > +design of the sys_msync() routine forbids starting I/O for the MS_ASYNC case.
>
> I don't think anyone forbids starting I/O, it's just too expensive,
> especially if it means, waiting for previous writeback on page to
> finish first.
>
> > +The second approach is to write protect the page for triggering a pagefault
> > +at the next write reference. Note that the dirty flag for the page should not
> > +be cleared thereby.
> > +
> > +In the "lazy update" approach, the requirements (1.1), (1.2), (1.3), and (1.4)
> > +taken together result in adding code at least to the following kernel routines:
> > +sys_msync(), do_fsync(), some routine in the unmap() call path, some routine
> > +in the sync() call path.
> > +
> > +Finally, a file_update_time()-like function would have to be created for
> > +processing the inode objects, not file objects. This is due to the fact that
> > +during the sync() operation, the file object may not exist any more, only
> > +the inode is known.
> > +
> > +To sum up: this "lazy" approach leads to massive changes, incurs overhead in
> > +the block device case, and requires complicated design decisions.
> > +
> > +3. Immediate update
> > +
> > +OK, still reading? There's a better way.
> > +
> > +In a fashion analogous to what happens at write(2), react to the fact
> > +that the page gets dirtied by updating the file times immediately.
> > +Thereby any page writeback happens when the write reference has already
> > +been accounted for from the view point of file times.
> > +
> > +The only problem which remains is to force refreshing file times at the write
> > +reference following a call to msync() with MS_ASYNC. As mentioned above, all
> > +that is needed here is to force a pagefault.
> > +
> > +The vma_wrprotect() routine introduced in this patch series is called
> > +from sys_msync() in the MS_ASYNC case. The former routine is essentially
> > +a version of existing page_mkclean_one() function from mm/rmap.c. Unlike
> > +the latter function, the vma_wrprotect() does not touch the dirty bit.
>
> Benchmark results should be also added to the relevant sections, I
> think. There is a very definite cost to all this, and a 10x slowdown
> is usually not taken lightly...
>
> Great document, btw :)
>
> Miklos
>

2008-01-23 10:53:22

[permalink] [raw]

Subject: Re: [PATCH -v8 4/4] The design document for memory-mapped file times update

> I've already written in the cover letter that functional tests passed
> successfully.
>
> debian:~/times# ./times /mnt/file
> begin 1201084493 1201084493 1201084281
> write 1201084494 1201084494 1201084281
> mmap 1201084494 1201084494 1201084495
> b 1201084496 1201084496 1201084495

Ah, OK, this is becuase mmap doesn't actually set up the page tables
by default. Try adding MAP_POPULATE to the flags.

Please also try

./times /mnt/file -s

Thanks,
Miklos

2008-01-23 11:16:51

[permalink] [raw]

Subject: Re: [PATCH -v8 4/4] The design document for memory-mapped file times update

> Ah, OK, this is becuase mmap doesn't actually set up the page tables
> by default. Try adding MAP_POPULATE to the flags.

Here's an updated version of the program, with an added a '-r' option,
that performs a read access on the mapping before doing the write
(basically equivalent to MAP_POPULATE, but portable).

Please try this on a tmpfs file.

Thanks,
Miklos

---
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/wait.h>

static const char *filename;
static int msync_flag = MS_ASYNC;
static int msync_fork = 0;
static int msync_read = 0;

static void print_times(const char *msg)
{
struct stat stbuf;
stat(filename, &stbuf);
printf("%s\t%li\t%li\t%li\n", msg, stbuf.st_ctime, stbuf.st_mtime,
stbuf.st_atime);
}

static void do_msync(void *addr, int len)
{
int res;
if (!msync_fork) {
res = msync(addr, len, msync_flag);
if (res == -1) {
perror("msync");
exit(1);
}
} else {
int pid = fork();
if (pid == -1) {
perror("fork");
exit(1);
}
if (!pid) {
int fd = open(filename, O_RDWR);
if (fd == -1) {
perror("open");
exit(1);
}
addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED) {
perror("mmap");
exit(1);
}
res = msync(addr, len, msync_flag);
if (res == -1) {
perror("msync");
exit(1);
}
exit(0);
}
wait(NULL);
}
}

static void usage(const char *progname)
{
fprintf(stderr, "usage: %s filename [-sfr]\n", progname);
fprintf(stderr, " -s: use MS_SYNC instead of MS_ASYNC\n");
fprintf(stderr, " -f: fork and perform msync in a different process\n");
fprintf(stderr, " -r: do a read access before each write access\n");
exit(1);
}

int main(int argc, char *argv[])
{
int res;
char *addr;
char tmp[32];
int fd;

if (argc < 2)
usage(argv[0]);

filename = argv[1];
if (argc > 2) {
char *s;
if (argc > 3)
usage(argv[0]);
s = argv[2];
if (s[0] != '-' || !s[1])
usage(argv[0]);
for (s++; *s; s++) {
switch (*s) {
case 's':
msync_flag = MS_SYNC;
break;
case 'f':
msync_fork = 1;
break;
case 'r':
msync_read = 1;
break;
default:
usage(argv[0]);
}
}
}

fd = open(filename, O_RDWR | O_TRUNC | O_CREAT, 0666);
if (fd == -1) {
perror(filename);
return 1;
}
print_times("begin");
sleep(1);
write(fd, "wxyz\n", 4);
print_times("write");
sleep(1);
addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED) {
perror("mmap");
return 1;
}
print_times("mmap");
sleep(1);

if (msync_read) {
sprintf(tmp, "fetch %c", addr[1]);
print_times(tmp);
sleep(1);
}
addr[1] = 'b';
print_times("store b");
sleep(1);
do_msync(addr, 4);
print_times("msync");
sleep(1);

if (msync_read) {
sprintf(tmp, "fetch %c", addr[2]);
print_times(tmp);
sleep(1);
}
addr[2] = 'c';
print_times("store c");
sleep(1);
do_msync(addr, 4);
print_times("msync");
sleep(1);

if (msync_read) {
sprintf(tmp, "fetch %c", addr[3]);
print_times(tmp);
sleep(1);
}
addr[3] = 'd';
print_times("store d");
sleep(1);
res = munmap(addr, 4);
if (res == -1) {
perror("munmap");
return 1;
}
print_times("munmap");
sleep(1);

res = close(fd);
if (res == -1) {
perror("close");
return 1;
}
print_times("close");
sleep(1);
sync();
print_times("sync");

return 0;
}

2008-01-23 12:25:47

[permalink] [raw]

Subject: Re: [PATCH -v8 4/4] The design document for memory-mapped file times update

2008/1/23, Miklos Szeredi <[email protected]>:
> > Ah, OK, this is becuase mmap doesn't actually set up the page tables
> > by default. Try adding MAP_POPULATE to the flags.
>
> Here's an updated version of the program, with an added a '-r' option,
> that performs a read access on the mapping before doing the write
> (basically equivalent to MAP_POPULATE, but portable).
>
> Please try this on a tmpfs file.
>
> Thanks,
> Miklos
>
> ---
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <string.h>
> #include <fcntl.h>
> #include <sys/mman.h>
> #include <sys/stat.h>
> #include <sys/wait.h>
>
> static const char *filename;
> static int msync_flag = MS_ASYNC;
> static int msync_fork = 0;
> static int msync_read = 0;
>
> static void print_times(const char *msg)
> {
> struct stat stbuf;
> stat(filename, &stbuf);
> printf("%s\t%li\t%li\t%li\n", msg, stbuf.st_ctime, stbuf.st_mtime,
> stbuf.st_atime);
> }
>
> static void do_msync(void *addr, int len)
> {
> int res;
> if (!msync_fork) {
> res = msync(addr, len, msync_flag);
> if (res == -1) {
> perror("msync");
> exit(1);
> }
> } else {
> int pid = fork();
> if (pid == -1) {
> perror("fork");
> exit(1);
> }
> if (!pid) {
> int fd = open(filename, O_RDWR);
> if (fd == -1) {
> perror("open");
> exit(1);
> }
> addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> if (addr == MAP_FAILED) {
> perror("mmap");
> exit(1);
> }
> res = msync(addr, len, msync_flag);
> if (res == -1) {
> perror("msync");
> exit(1);
> }
> exit(0);
> }
> wait(NULL);
> }
> }
>
> static void usage(const char *progname)
> {
> fprintf(stderr, "usage: %s filename [-sfr]\n", progname);
> fprintf(stderr, " -s: use MS_SYNC instead of MS_ASYNC\n");
> fprintf(stderr, " -f: fork and perform msync in a different process\n");
> fprintf(stderr, " -r: do a read access before each write access\n");
> exit(1);
> }
>
> int main(int argc, char *argv[])
> {
> int res;
> char *addr;
> char tmp[32];
> int fd;
>
> if (argc < 2)
> usage(argv[0]);
>
> filename = argv[1];
> if (argc > 2) {
> char *s;
> if (argc > 3)
> usage(argv[0]);
> s = argv[2];
> if (s[0] != '-' || !s[1])
> usage(argv[0]);
> for (s++; *s; s++) {
> switch (*s) {
> case 's':
> msync_flag = MS_SYNC;
> break;
> case 'f':
> msync_fork = 1;
> break;
> case 'r':
> msync_read = 1;
> break;
> default:
> usage(argv[0]);
> }
> }
> }
>
> fd = open(filename, O_RDWR | O_TRUNC | O_CREAT, 0666);
> if (fd == -1) {
> perror(filename);
> return 1;
> }
> print_times("begin");
> sleep(1);
> write(fd, "wxyz\n", 4);
> print_times("write");
> sleep(1);
> addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> if (addr == MAP_FAILED) {
> perror("mmap");
> return 1;
> }
> print_times("mmap");
> sleep(1);
>
> if (msync_read) {
> sprintf(tmp, "fetch %c", addr[1]);
> print_times(tmp);
> sleep(1);
> }
> addr[1] = 'b';
> print_times("store b");
> sleep(1);
> do_msync(addr, 4);
> print_times("msync");
> sleep(1);
>
> if (msync_read) {
> sprintf(tmp, "fetch %c", addr[2]);
> print_times(tmp);
> sleep(1);
> }
> addr[2] = 'c';
> print_times("store c");
> sleep(1);
> do_msync(addr, 4);
> print_times("msync");
> sleep(1);
>
> if (msync_read) {
> sprintf(tmp, "fetch %c", addr[3]);
> print_times(tmp);
> sleep(1);
> }
> addr[3] = 'd';
> print_times("store d");
> sleep(1);
> res = munmap(addr, 4);
> if (res == -1) {
> perror("munmap");
> return 1;
> }
> print_times("munmap");
> sleep(1);
>
> res = close(fd);
> if (res == -1) {
> perror("close");
> return 1;
> }
> print_times("close");
> sleep(1);
> sync();
> print_times("sync");
>
> return 0;
> }
>

Miklos, thanks for your program. Its output is given below.

debian:~/miklos# mount | grep mnt
tmpfs on /mnt type tmpfs (rw)
debian:~/miklos# ./miklos /mnt/file
begin 1201089989 1201089989 1201085868
write 1201089990 1201089990 1201085868
mmap 1201089990 1201089990 1201089991
store b 1201089992 1201089992 1201089991
msync 1201089992 1201089992 1201089991
store c 1201089994 1201089994 1201089991
msync 1201089994 1201089994 1201089991
store d 1201089996 1201089996 1201089991
munmap 1201089996 1201089996 1201089991
close 1201089996 1201089996 1201089991
sync 1201089996 1201089996 1201089991
debian:~/miklos# ./miklos /mnt/file -r
begin 1201090025 1201090025 1201089991
write 1201090026 1201090026 1201089991
mmap 1201090026 1201090026 1201090027
fetch x 1201090026 1201090026 1201090027
store b 1201090026 1201090026 1201090027
msync 1201090026 1201090026 1201090027
fetch y 1201090026 1201090026 1201090027
store c 1201090032 1201090032 1201090027
msync 1201090032 1201090032 1201090027
fetch z 1201090032 1201090032 1201090027
store d 1201090036 1201090036 1201090027
munmap 1201090036 1201090036 1201090027
close 1201090036 1201090036 1201090027
sync 1201090036 1201090036 1201090027
debian:~/miklos#

I think that after the patches applied the msync() system call does
everything, which it is expected to do. The issue you're now talking
about is one belonging to do_fsync() and the design of the tmpfs
driver itself, I believe. Indeed, when the first write in the "-r"
version of the test did not update the stamps, this is obviously not
an msync() guilt.

By the way, I would not like to move the "4/4" patch to the earlier
place, because then it would describe the functionality, which had not
been introduced yet.

2008-01-23 12:53:49

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

2008/1/23, Peter Zijlstra <[email protected]>:
>
> On Wed, 2008-01-23 at 02:21 +0300, Anton Salikhmetov wrote:
>
> > +static void vma_wrprotect_pmd_range(struct vm_area_struct *vma, pmd_t *pmd,
> > + unsigned long start, unsigned long end)
> > +{
> > + while (start < end) {
> > + spinlock_t *ptl;
> > + pte_t *pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
> > +
> > + if (pte_dirty(*pte) && pte_write(*pte)) {
> > + pte_t entry = ptep_clear_flush(vma, start, pte);
> > +
> > + entry = pte_wrprotect(entry);
> > + set_pte_at(vma->vm_mm, start, pte, entry);
> > + }
> > +
> > + pte_unmap_unlock(pte, ptl);
> > + start += PAGE_SIZE;
> > + }
> > +}
>
> You've had two examples on how to write this loop, one from git commit
> 204ec841fbea3e5138168edbc3a76d46747cc987, and one from my draft, but
> this one looks like neither and is much less efficient. Take the lock
> only once per pmd, not once per pte please.
>
> > +static void vma_wrprotect_pud_range(struct vm_area_struct *vma, pud_t *pud,
> > + unsigned long start, unsigned long end)
> > +{
> > + pmd_t *pmd = pmd_offset(pud, start);
> > +
> > + while (start < end) {
> > + unsigned long next = pmd_addr_end(start, end);
> > +
> > + if (!pmd_none_or_clear_bad(pmd))
> > + vma_wrprotect_pmd_range(vma, pmd, start, next);
> > +
> > + ++pmd;
> > + start = next;
> > + }
> > +}
> > +
> > +static void vma_wrprotect_pgd_range(struct vm_area_struct *vma, pgd_t *pgd,
> > + unsigned long start, unsigned long end)
> > +{
> > + pud_t *pud = pud_offset(pgd, start);
> > +
> > + while (start < end) {
> > + unsigned long next = pud_addr_end(start, end);
> > +
> > + if (!pud_none_or_clear_bad(pud))
> > + vma_wrprotect_pud_range(vma, pud, start, next);
> > +
> > + ++pud;
> > + start = next;
> > + }
> > +}
> > +
> > +static void vma_wrprotect(struct vm_area_struct *vma)
> > +{
> > + unsigned long addr = vma->vm_start;
> > + pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
> > +
> > + while (addr < vma->vm_end) {
> > + unsigned long next = pgd_addr_end(addr, vma->vm_end);
> > +
> > + if (!pgd_none_or_clear_bad(pgd))
> > + vma_wrprotect_pgd_range(vma, pgd, addr, next);
> > +
> > + ++pgd;
> > + addr = next;
> > + }
> > +}
>
> I think you want to pass start, end here too, you might not need to
> sweep the whole vma.

Thanks for you feedback, Peter!

I will redesign the vma_wrprotect_pmd_range() routine the way it
acquires the spinlock outside of the loop. I will also rewrite the
vma_wrprotect() function to process only the specified range.

>
>
>

2008-01-23 13:09:22

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

2008/1/23, Miklos Szeredi <[email protected]>:
> > > Also, it still doesn't make sense to me why we'd not need to walk the
> > > rmap, it is all the same file after all.
> >
> > It's the same file, but not the same memory map. It basically depends
> > on how you define msync:
> >
> > a) sync _file_ on region defined by this mmap/start/end-address
> > b) sync _memory_region_ defined by start/end-address
>
> My mmap/msync tester program can acually check this as well, with the
> '-f' flag. Anton, can you try that on the reference platforms?

Here it is:

$ ./a.out file -f
begin 1201085546 1201085546 1200956936
write 1201085546 1201085546 1200956936
mmap 1201085546 1201085546 1200956936
b 1201085546 1201085546 1200956936
msync b 1201085550 1201085550 1200956936
c 1201085550 1201085550 1200956936
msync c 1201085552 1201085552 1200956936
d 1201085552 1201085552 1200956936
munmap 1201085552 1201085552 1200956936
close 1201085555 1201085555 1200956936
sync 1201085555 1201085555 1200956936
$ ./a.out file -sf
begin 1201085572 1201085572 1200956936
write 1201085572 1201085572 1200956936
mmap 1201085572 1201085572 1200956936
b 1201085572 1201085572 1200956936
msync b 1201085576 1201085576 1200956936
c 1201085576 1201085576 1200956936
msync c 1201085578 1201085578 1200956936
d 1201085578 1201085578 1200956936
munmap 1201085578 1201085578 1200956936
close 1201085581 1201085581 1200956936
sync 1201085581 1201085581 1200956936
$ uname -a
FreeBSD td152.testdrive.hp.com 6.2-RELEASE FreeBSD 6.2-RELEASE #0: Fri
Jan 12 11:05:30 UTC 2007
[email protected]:/usr/obj/usr/src/sys/SMP i386
$

>
> Miklos
>

2008-01-23 13:55:48

[permalink] [raw]

Subject: Re: [PATCH -v8 4/4] The design document for memory-mapped file times update

>
> Miklos, thanks for your program. Its output is given below.
>
> debian:~/miklos# mount | grep mnt
> tmpfs on /mnt type tmpfs (rw)
> debian:~/miklos# ./miklos /mnt/file
> begin 1201089989 1201089989 1201085868
> write 1201089990 1201089990 1201085868
> mmap 1201089990 1201089990 1201089991
> store b 1201089992 1201089992 1201089991
> msync 1201089992 1201089992 1201089991
> store c 1201089994 1201089994 1201089991
> msync 1201089994 1201089994 1201089991
> store d 1201089996 1201089996 1201089991
> munmap 1201089996 1201089996 1201089991
> close 1201089996 1201089996 1201089991
> sync 1201089996 1201089996 1201089991
> debian:~/miklos# ./miklos /mnt/file -r
> begin 1201090025 1201090025 1201089991
> write 1201090026 1201090026 1201089991
> mmap 1201090026 1201090026 1201090027
> fetch x 1201090026 1201090026 1201090027
> store b 1201090026 1201090026 1201090027
> msync 1201090026 1201090026 1201090027
> fetch y 1201090026 1201090026 1201090027
> store c 1201090032 1201090032 1201090027
> msync 1201090032 1201090032 1201090027
> fetch z 1201090032 1201090032 1201090027
> store d 1201090036 1201090036 1201090027
> munmap 1201090036 1201090036 1201090027
> close 1201090036 1201090036 1201090027
> sync 1201090036 1201090036 1201090027
> debian:~/miklos#
>
> I think that after the patches applied the msync() system call does
> everything, which it is expected to do. The issue you're now talking
> about is one belonging to do_fsync() and the design of the tmpfs
> driver itself, I believe.

Right. My problem is that msync() does _too_much_ for ram-backed
mappings. It shouldn't write protect pages in this case, because it
has a high cost and dubious value.

Similar argument probably holds for file_update_time() from the fault
handlers. They should examine, if it's a ram-backed fs (with
mapping_cap_account_dirty()) and not update the times for those.

It's better to consistently not update the times, than to randomly do
so, which could lead to false expectations, and hard to track down
bugs.

Miklos

2008-01-23 17:07:45

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 23 Jan 2008, Anton Salikhmetov wrote:
> +
> + if (pte_dirty(*pte) && pte_write(*pte)) {

Not correct.

You still need to check "pte_present()" before you can test any other
bits. For a non-present pte, none of the other bits are defined, and for
all we know there might be architectures out there that require them to
be non-dirty.

As it is, you just possibly randomly corrupted the pte.

Yeah, on all architectures I know of, it the pte is clear, neither of
those tests will trigger, so it just happens to work, but it's still
wrong. And for a MAP_SHARED mapping, it should be either clear or valid,
although I can imagine that we might do swap-cache entries for tmpfs or
something (in which case trying to clear the write-enable bit would
corrupt the swap entry!).

So the bug might be hard or even impossible to trigger in practice, but
it's still wrong.

I realize that "page_mkclean_one()" doesn't do this very obviously, but
it's actually there (it's just hidden in page_check_address()).

Quite frankly, at this point I'm getting *very* tired of this series.
Especially since you ignored me when I suggested you just revert the
commit that removed the page table walking - and instead send in a buggy
patch.

Yes, the VM is hard. I agree. It's nasty. But exactly because it's nasty
and subtle and horrid, I'm also very anal about it, and I get really
nervous when somebody touches it without (a) knowing all the rules
intimately and (b) listening to people who do.

So here's even a patch to get you started. Do this:

git revert 204ec841fbea3e5138168edbc3a76d46747cc987

and then use this appended patch on top of that as a starting point for
something that compiles and *possibly* works.

And no, I do *not* guarantee that this is right either! I have not tested
it or thought about it a lot, and S390 tends to be odd about some of these
things. In particular, I actually suspect that we should possibly do this
the same way we do

ptep_clear_flush_young()

except we would do "ptep_clear_flush_wrprotect()". So even though this is
a revert plus a simple patch to make it compile again (we've changed how
we do dirty bits), I think a patch like this needs testing and other
people like Nick and Peter to ack it.

Nick? Peter? Testing? Other comments?

Linus

---
mm/msync.c | 9 ++++++---
1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/mm/msync.c b/mm/msync.c
index a30487f..9b0af8f 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -32,6 +32,7 @@ static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
again:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
+ pte_t entry;
struct page *page;

if (progress >= 64) {
@@ -47,9 +48,11 @@ again:
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
- if (ptep_clear_flush_dirty(vma, addr, pte) ||
- page_test_and_clear_dirty(page))
- ret += set_page_dirty(page);
+ entry = ptep_clear_flush(vma, addr, pte);
+ entry = pte_wrprotect(entry);
+ set_pte_at(mm, address, pte, entry);
+
+ ret += 1;
progress += 3;
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(pte - 1, ptl);

2008-01-23 19:36:53

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 23 Jan 2008, Peter Zijlstra wrote:
>
> It would need some addition piece to not call msync_interval() for
> MS_SYNC, and remove the balance_dirty_pages_ratelimited_nr() stuff.
>
> But yeah, this pte walker is much better.

Actually, I think this patch is much better.

Anyway, it's better because:
- it actually honors the range
- it uses the same code for MS_ASYNC and MS_SYNC
- it just avoids doing the "wait for" for MS_ASYNC.

However, it's totally untested, of course. What did you expect? Clean code
_and_ testing?

[ Side note: it is quite possible that we should not do the
SYNC_FILE_RANGE_WAIT_BEFORE on MS_ASYNC, and just skip over pages that
are busily under writeback already.

Whatever.

There are probably other problems here too, so consider this a "Hey,
wouldn't something like this work really well?" patch rather than
something final. ]

Just to get peoples creative juices going, here's my suggested patch.

Linus

---
mm/msync.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/mm/msync.c b/mm/msync.c
index 144a757..a7a2ea4 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -10,10 +10,37 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/sched.h>

+static int msync_range(struct file *file, loff_t start, unsigned long len, unsigned int sync)
+{
+ int ret;
+ struct address_space *mapping = file->f_mapping;
+ loff_t end = start + len - 1;
+
+ ret = do_sync_mapping_range(mapping, start, end,
+ SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_BEFORE);
+
+ if (ret || !sync)
+ return ret;
+
+ if (file->f_op && file->f_op->fsync) {
+ mutex_lock(&mapping->host->i_mutex);
+ ret = file->f_op->fsync(file, file->f_path.dentry, 0);
+ mutex_unlock(&mapping->host->i_mutex);
+
+ if (ret < 0)
+ return ret;
+ }
+
+ return wait_on_page_writeback_range(mapping,
+ start >> PAGE_CACHE_SHIFT,
+ end >> PAGE_CACHE_SHIFT);
+}
+
/*
* MS_SYNC syncs the entire file - including mappings.
*
@@ -77,18 +104,35 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
goto out_unlock;
}
file = vma->vm_file;
- start = vma->vm_end;
- if ((flags & MS_SYNC) && file &&
- (vma->vm_flags & VM_SHARED)) {
+
+ if (file && (vma->vm_flags & VM_SHARED)) {
+ loff_t offset;
+ unsigned long len;
+
+ /*
+ * We need to do all of this before we release the mmap_sem,
+ * since "vma" isn't available after that.
+ */
+ offset = start - vma->vm_start;
+ offset += vma->vm_pgoff << PAGE_SHIFT;
+ len = end;
+ if (len > vma->vm_end)
+ len = vma->vm_end;
+ len -= start;
+
+ /* Update start here, since vm_end will be gone too.. */
+ start = vma->vm_end;
get_file(file);
up_read(&mm->mmap_sem);
- error = do_fsync(file, 0);
+
+ error = msync_range(file, offset, len, flags & MS_SYNC);
fput(file);
if (error || start >= end)
goto out;
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
} else {
+ start = vma->vm_end;
if (start >= end) {
error = 0;
goto out_unlock;

2008-01-23 19:55:52

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

> >
> > It would need some addition piece to not call msync_interval() for
> > MS_SYNC, and remove the balance_dirty_pages_ratelimited_nr() stuff.
> >
> > But yeah, this pte walker is much better.
>
> Actually, I think this patch is much better.
>
> Anyway, it's better because:
> - it actually honors the range
> - it uses the same code for MS_ASYNC and MS_SYNC
> - it just avoids doing the "wait for" for MS_ASYNC.
>
> However, it's totally untested, of course. What did you expect? Clean code
> _and_ testing?
>
> [ Side note: it is quite possible that we should not do the
> SYNC_FILE_RANGE_WAIT_BEFORE on MS_ASYNC, and just skip over pages that
> are busily under writeback already.

MS_ASYNC is not supposed to wait, so SYNC_FILE_RANGE_WAIT_BEFORE
probably should not be used in that case.

What would be perfect, is if we had a sync mode, that on encountering
a page currently under writeback, would just do a page_mkclean() on
it, so we still receive a page fault next time one of the mappings is
dirtied, so the times can be updated.

Would there be any difficulties with that?

Miklos

2008-01-23 18:04:40

[permalink] [raw]

Subject: Re: [PATCH -v8 2/4] Update ctime and mtime for memory-mapped files

On Wed, 23 Jan 2008, Anton Salikhmetov wrote:
>
> Update ctime and mtime for memory-mapped files at a write access on
> a present, read-only PTE, as well as at a write on a non-present PTE.

Ok, this one I'm applying. I agree that it leaves MS_ASYNC not updating
the file until the next sync actually happens, but I can't really bring
myself to care at least for an imminent 2.6.24 thing. The file times are
actually "correct" in the sense that they will now match when the IO is
done, and my man-page says that MS_ASYNC "schedules the io to be done".

And I think this is better than we have now, and I don't think this part
is somethign that anybody really disagrees with.

We can (and should) keep the MS_ASYNC issue open.

Linus

2008-01-23 17:51:18

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 2008-01-23 at 09:05 -0800, Linus Torvalds wrote:

> So here's even a patch to get you started. Do this:
>
> git revert 204ec841fbea3e5138168edbc3a76d46747cc987
>
> and then use this appended patch on top of that as a starting point for
> something that compiles and *possibly* works.
>
> And no, I do *not* guarantee that this is right either! I have not tested
> it or thought about it a lot, and S390 tends to be odd about some of these
> things. In particular, I actually suspect that we should possibly do this
> the same way we do
>
> ptep_clear_flush_young()
>
> except we would do "ptep_clear_flush_wrprotect()". So even though this is
> a revert plus a simple patch to make it compile again (we've changed how
> we do dirty bits), I think a patch like this needs testing and other
> people like Nick and Peter to ack it.
>
> Nick? Peter? Testing? Other comments?

It would need some addition piece to not call msync_interval() for
MS_SYNC, and remove the balance_dirty_pages_ratelimited_nr() stuff.

But yeah, this pte walker is much better.

As for s390, I think they only differ on the dirty bit, and we should
not be touching that.

2008-01-23 17:32:49

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

2008/1/23, Linus Torvalds <[email protected]>:
>
>
> On Wed, 23 Jan 2008, Anton Salikhmetov wrote:
> > +
> > + if (pte_dirty(*pte) && pte_write(*pte)) {
>
> Not correct.
>
> You still need to check "pte_present()" before you can test any other
> bits. For a non-present pte, none of the other bits are defined, and for
> all we know there might be architectures out there that require them to
> be non-dirty.
>
> As it is, you just possibly randomly corrupted the pte.
>
> Yeah, on all architectures I know of, it the pte is clear, neither of
> those tests will trigger, so it just happens to work, but it's still
> wrong. And for a MAP_SHARED mapping, it should be either clear or valid,
> although I can imagine that we might do swap-cache entries for tmpfs or
> something (in which case trying to clear the write-enable bit would
> corrupt the swap entry!).
>
> So the bug might be hard or even impossible to trigger in practice, but
> it's still wrong.
>
> I realize that "page_mkclean_one()" doesn't do this very obviously, but
> it's actually there (it's just hidden in page_check_address()).
>
> Quite frankly, at this point I'm getting *very* tired of this series.
> Especially since you ignored me when I suggested you just revert the
> commit that removed the page table walking - and instead send in a buggy
> patch.
>
> Yes, the VM is hard. I agree. It's nasty. But exactly because it's nasty
> and subtle and horrid, I'm also very anal about it, and I get really
> nervous when somebody touches it without (a) knowing all the rules
> intimately and (b) listening to people who do.
>
> So here's even a patch to get you started. Do this:
>
> git revert 204ec841fbea3e5138168edbc3a76d46747cc987
>
> and then use this appended patch on top of that as a starting point for
> something that compiles and *possibly* works.
>
> And no, I do *not* guarantee that this is right either! I have not tested
> it or thought about it a lot, and S390 tends to be odd about some of these
> things. In particular, I actually suspect that we should possibly do this
> the same way we do
>
> ptep_clear_flush_young()
>
> except we would do "ptep_clear_flush_wrprotect()". So even though this is
> a revert plus a simple patch to make it compile again (we've changed how
> we do dirty bits), I think a patch like this needs testing and other
> people like Nick and Peter to ack it.

I'm very sorry for my bad code which can not pass LKML's review.

I reassigned the bug #2645 to default assignee, Andrew Morton, because
it seems that people start getting tired of my patch series.

Thanks for your support.

>
> Nick? Peter? Testing? Other comments?
>
> Linus
>
> ---
> mm/msync.c | 9 ++++++---
> 1 files changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/mm/msync.c b/mm/msync.c
> index a30487f..9b0af8f 100644
> --- a/mm/msync.c
> +++ b/mm/msync.c
> @@ -32,6 +32,7 @@ static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
> again:
> pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> do {
> + pte_t entry;
> struct page *page;
>
> if (progress >= 64) {
> @@ -47,9 +48,11 @@ again:
> page = vm_normal_page(vma, addr, *pte);
> if (!page)
> continue;
> - if (ptep_clear_flush_dirty(vma, addr, pte) ||
> - page_test_and_clear_dirty(page))
> - ret += set_page_dirty(page);
> + entry = ptep_clear_flush(vma, addr, pte);
> + entry = pte_wrprotect(entry);
> + set_pte_at(mm, address, pte, entry);
> +
> + ret += 1;
> progress += 3;
> } while (pte++, addr += PAGE_SIZE, addr != end);
> pte_unmap_unlock(pte - 1, ptl);
>

2008-01-23 21:02:32

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 23 Jan 2008, Miklos Szeredi wrote:
>
> > [ Side note: it is quite possible that we should not do the
> > SYNC_FILE_RANGE_WAIT_BEFORE on MS_ASYNC, and just skip over pages that
> > are busily under writeback already.
>
> MS_ASYNC is not supposed to wait, so SYNC_FILE_RANGE_WAIT_BEFORE
> probably should not be used in that case.

Well, that would leave the page dirty (including in the page tables) if it
was under page writeback when the MS_ASYNC happened.

So I agree, we shouldn't necessarily wait, but if we want the page tables
to be cleaned, right now we need to.

> What would be perfect, is if we had a sync mode, that on encountering
> a page currently under writeback, would just do a page_mkclean() on
> it, so we still receive a page fault next time one of the mappings is
> dirtied, so the times can be updated.
>
> Would there be any difficulties with that?

It would require fairly invasive changes. Right now the actual page
writeback does effectively:

...
if (wbc->sync_mode != WB_SYNC_NONE)
wait_on_page_writeback(page);

if (PageWriteback(page) ||
!clear_page_dirty_for_io(page)) {
unlock_page(page);
continue;
}

ret = (*writepage)(page, wbc, data);
...

and that "clear_page_dirty_for_io()" really does clear *all* the dirty
bits, so we absolutely must start writepage() when we have done that. And
that, in turn, requires that we're not already under writeback.

Is it possible to fix? Sure. We'd have to split up
clear_page_dirty_for_io() to do it, and do the

if (mapping && mapping_cap_account_dirty(mapping))
..

part first (before the PageWriteback() tests), and then doing the

if (TestClearPageDirty(page))
...

parts later (after checking that that we're not under page-writeback).

So it's not horribly hard, but it's kind of a separate issue right now.
And while the *generic* page-writeback is easy enough to fix, I worry
about low-level filesystems that have their own "writepages()"
implementation. They could easily get that wrong.

So right now it seems that waiting for writeback to finish is the right
and safe thing to do (and even so, I'm not actually willing to commit my
suggested patch in 2.6.24, I think this needs more thinking about)

Linus

2008-01-23 21:16:51

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

> So it's not horribly hard, but it's kind of a separate issue right now.
> And while the *generic* page-writeback is easy enough to fix, I worry
> about low-level filesystems that have their own "writepages()"
> implementation. They could easily get that wrong.

Yeah, nasty.

How about doing it in a separate pass, similarly to
wait_on_page_writeback()? Just instead of waiting, clean the page
tables for writeback pages.

> So right now it seems that waiting for writeback to finish is the right
> and safe thing to do (and even so, I'm not actually willing to commit my
> suggested patch in 2.6.24, I think this needs more thinking about)

Sure, I would have though all of this stuff is 2.6.25, but it's your
kernel... :)

Miklos

2008-01-23 21:38:18

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 23 Jan 2008, Miklos Szeredi wrote:
>
> Yeah, nasty.
>
> How about doing it in a separate pass, similarly to
> wait_on_page_writeback()? Just instead of waiting, clean the page
> tables for writeback pages.

That sounds like a good idea, but it doesn't work.

The thing is, we need to hold the page-table lock over the whole sequence
of

if (page_mkclean(page))
set_page_dirty(page);
if (TestClearPageDirty(page))
..

and there's a big comment about why in clear_page_dirty_for_io().

So if you split it up, so that the first phase is that

if (page_mkclean(page))
set_page_dirty(page);

and the second phase is the one that just does a

if (TestClearPageDirty(page))
writeback(..)

and having dropped the page lock in between, then you lose: because
another thread migth have faulted in and re-dirtied the page table entry,
and you MUST NOT do that "TestClearPageDirty()" in that case!

That dirty bit handling is really really important, and it's sadly also
really really easy to get wrong (usually in ways that are hard to even
notice: things still work 99% of the time, and you might just be leaking
memory slowly, and fsync/msync() might not write back memory mapped data
to disk at all etc).

> Sure, I would have though all of this stuff is 2.6.25, but it's your
> kernel... :)

Well, the plain added "file_update_time()" call addition looked like a
trivial fix, and if there are actually *customers* that have bad backups
due to this, then I think that part was worth doing. At least a "sync"
will then sync the file times...

Linus

2008-01-23 22:32:25

by Hugh Dickins

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 23 Jan 2008, Linus Torvalds wrote:
> On Wed, 23 Jan 2008, Miklos Szeredi wrote:
>
> > Sure, I would have though all of this stuff is 2.6.25, but it's your
> > kernel... :)
>
> Well, the plain added "file_update_time()" call addition looked like a
> trivial fix, and if there are actually *customers* that have bad backups
> due to this, then I think that part was worth doing. At least a "sync"
> will then sync the file times...

Fair enough.

Something I dislike about it, though, is that it leaves the RAM-backed
filesystems (ramfs, tmpfs, whatever) behaving visibly differently from
the others. Until now we've intentionally left them out of syncing and
dirty accounting, because it's useless overhead for them (one can argue
whether that's quite true of tmpfs overflowing out to swap, but that's
a different debate). So they won't be getting these faults on shared
writable, so their file times won't get updated in the same way.

But I guess that's an aesthetic consideration, of less significance
than bad backups - assuming not many people use backups of tmpfs.

Hugh

2008-01-23 22:44:04

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 23 Jan 2008, Hugh Dickins wrote:
>
> Something I dislike about it, though, is that it leaves the RAM-backed
> filesystems (ramfs, tmpfs, whatever) behaving visibly differently from
> the others.

I hear you.

But I'm not seeing many alternatives, unless we start taking write faults
on them unnecessarily. Do we care? Probably not really.

So we certainly *could* make ramfs/tmpfs claim they do dirty accounting,
but just having a no-op writeback. Without that, they'd need something
really special in the file time updates.

Personally, I don't really see anybody really caring one way or the other,
but who knows..

Linus

2008-01-23 23:14:28

[permalink] [raw]

Subject: Re: [PATCH -v8 2/4] Update ctime and mtime for memory-mapped files

2008/1/23, Linus Torvalds <[email protected]>:
>
>
> On Wed, 23 Jan 2008, Anton Salikhmetov wrote:
> >
> > Update ctime and mtime for memory-mapped files at a write access on
> > a present, read-only PTE, as well as at a write on a non-present PTE.
>
> Ok, this one I'm applying. I agree that it leaves MS_ASYNC not updating
> the file until the next sync actually happens, but I can't really bring
> myself to care at least for an imminent 2.6.24 thing. The file times are
> actually "correct" in the sense that they will now match when the IO is
> done, and my man-page says that MS_ASYNC "schedules the io to be done".
>
> And I think this is better than we have now, and I don't think this part
> is somethign that anybody really disagrees with.
>
> We can (and should) keep the MS_ASYNC issue open.

Thank you!

I have closed the bug #2645, because this patch solves the issue
originally reported.

>
> Linus
>

2008-01-24 00:05:34

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

> > How about doing it in a separate pass, similarly to
> > wait_on_page_writeback()? Just instead of waiting, clean the page
> > tables for writeback pages.
>
> That sounds like a good idea, but it doesn't work.
>
> The thing is, we need to hold the page-table lock over the whole sequence
> of
>
> if (page_mkclean(page))
> set_page_dirty(page);
> if (TestClearPageDirty(page))
> ..
>
> and there's a big comment about why in clear_page_dirty_for_io().
>
> So if you split it up, so that the first phase is that
>
> if (page_mkclean(page))
> set_page_dirty(page);
>
> and the second phase is the one that just does a
>
> if (TestClearPageDirty(page))
> writeback(..)
>
> and having dropped the page lock in between, then you lose: because
> another thread migth have faulted in and re-dirtied the page table entry,
> and you MUST NOT do that "TestClearPageDirty()" in that case!
>
> That dirty bit handling is really really important, and it's sadly also
> really really easy to get wrong (usually in ways that are hard to even
> notice: things still work 99% of the time, and you might just be leaking
> memory slowly, and fsync/msync() might not write back memory mapped data
> to disk at all etc).

OK.

But I still think this approach should work. Untested, might eat your
children, so please don't apply to any kernel.

Miklos

Index: linux/mm/msync.c
===================================================================
--- linux.orig/mm/msync.c 2008-01-24 00:18:31.000000000 +0100
+++ linux/mm/msync.c 2008-01-24 00:50:37.000000000 +0100
@@ -10,9 +10,91 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/backing-dev.h>
+
+static void mkclean_writeback_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ struct pagevec pvec;
+ pgoff_t index;
+
+ if (!mapping_cap_account_dirty(mapping))
+ return;
+
+ if (end < start)
+ return;
+
+ pagevec_init(&pvec, 0);
+ index = start;
+ while (index <= end) {
+ unsigned i;
+ int nr_pages = min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_WRITEBACK,
+ nr_pages);
+ if (!nr_pages)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /* until radix tree lookup accepts end_index */
+ if (page->index > end)
+ continue;
+
+ lock_page(page);
+ if (page_mkclean(page))
+ set_page_dirty(page);
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+}
+
+static int msync_range(struct file *file, loff_t start, unsigned long len, unsigned int sync)
+{
+ int ret;
+ struct address_space *mapping = file->f_mapping;
+ loff_t end = start + len - 1;
+ int sync_flags = SYNC_FILE_RANGE_WRITE;
+
+ if (sync) {
+ sync_flags |= SYNC_FILE_RANGE_WAIT_BEFORE;
+ } else {
+ /*
+ * For MS_ASYNC, don't wait for writback already in
+ * progress, but instead just clean the page tables.
+ */
+ mkclean_writeback_pages(mapping,
+ start >> PAGE_CACHE_SHIFT,
+ end >> PAGE_CACHE_SHIFT);
+ }
+
+ ret = do_sync_mapping_range(mapping, start, end, sync_flags);
+ if (ret || !sync)
+ return ret;
+
+ if (file->f_op && file->f_op->fsync) {
+ mutex_lock(&mapping->host->i_mutex);
+ ret = file->f_op->fsync(file, file->f_path.dentry, 0);
+ mutex_unlock(&mapping->host->i_mutex);
+
+ if (ret < 0)
+ return ret;
+ }
+
+ return wait_on_page_writeback_range(mapping,
+ start >> PAGE_CACHE_SHIFT,
+ end >> PAGE_CACHE_SHIFT);
+}

/*
* MS_SYNC syncs the entire file - including mappings.
@@ -77,18 +159,36 @@ asmlinkage long sys_msync(unsigned long
goto out_unlock;
}
file = vma->vm_file;
- start = vma->vm_end;
- if ((flags & MS_SYNC) && file &&
- (vma->vm_flags & VM_SHARED)) {
+
+ if (file && (vma->vm_flags & VM_SHARED) &&
+ (flags & (MS_SYNC | MS_ASYNC))) {
+ loff_t offset;
+ unsigned long len;
+
+ /*
+ * We need to do all of this before we release the mmap_sem,
+ * since "vma" isn't available after that.
+ */
+ offset = start - vma->vm_start;
+ offset += vma->vm_pgoff << PAGE_SHIFT;
+ len = end;
+ if (len > vma->vm_end)
+ len = vma->vm_end;
+ len -= start;
+
+ /* Update start here, since vm_end will be gone too.. */
+ start = vma->vm_end;
get_file(file);
up_read(&mm->mmap_sem);
- error = do_fsync(file, 0);
+
+ error = msync_range(file, offset, len, flags & MS_SYNC);
fput(file);
if (error || start >= end)
goto out;
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
} else {
+ start = vma->vm_end;
if (start >= end) {
error = 0;
goto out_unlock;

2008-01-24 00:13:07

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Thu, 24 Jan 2008, Miklos Szeredi wrote:
>
> But I still think this approach should work. Untested, might eat your
> children, so please don't apply to any kernel.

Yes, something like that should work.

The important part is to not do the "TestClearPageDirty()" separately, the
rest of it can be done well enough.

Linus

2008-01-24 00:17:19

by Hugh Dickins

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Wed, 23 Jan 2008, Linus Torvalds wrote:
>
> So we certainly *could* make ramfs/tmpfs claim they do dirty accounting,
> but just having a no-op writeback. Without that, they'd need something
> really special in the file time updates.

What we might reasonably choose to end up doing there (in 2.6.25)
is sending tmpfs etc. through the extra faulting for linked files,
but skipping it as at present for unlinked files i.e. shared memory
would continue to skip the extra faults, shared memory being the case we
really wanted to avoid the overhead on when dirty page accounting came in.

> Personally, I don't really see anybody really caring one way or the other,
> but who knows..

I care a bit, because I don't like to feel that tmpfs is now left
saddled with the bug that every filesystem has had for years before.
I'll need to compare the small performance cost of fixing it against
the unease of leaving it alone.

Hugh

2008-01-24 01:36:19

by Nick Piggin

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Thursday 24 January 2008 04:05, Linus Torvalds wrote:
> On Wed, 23 Jan 2008, Anton Salikhmetov wrote:
> > +
> > + if (pte_dirty(*pte) && pte_write(*pte)) {
>
> Not correct.
>
> You still need to check "pte_present()" before you can test any other
> bits. For a non-present pte, none of the other bits are defined, and for
> all we know there might be architectures out there that require them to
> be non-dirty.
>
> As it is, you just possibly randomly corrupted the pte.
>
> Yeah, on all architectures I know of, it the pte is clear, neither of
> those tests will trigger, so it just happens to work, but it's still
> wrong.

Probably it can fail for !present nonlinear mappings on many
architectures.

2008-01-24 18:57:59

by Matt Mackall

[permalink] [raw]

Subject: Re: [PATCH -v8 3/4] Enable the MS_ASYNC functionality in sys_msync()

On Thu, 2008-01-24 at 12:36 +1100, Nick Piggin wrote:
> On Thursday 24 January 2008 04:05, Linus Torvalds wrote:
> > On Wed, 23 Jan 2008, Anton Salikhmetov wrote:
> > > +
> > > + if (pte_dirty(*pte) && pte_write(*pte)) {
> >
> > Not correct.
> >
> > You still need to check "pte_present()" before you can test any other
> > bits. For a non-present pte, none of the other bits are defined, and for
> > all we know there might be architectures out there that require them to
> > be non-dirty.
> >
> > As it is, you just possibly randomly corrupted the pte.
> >
> > Yeah, on all architectures I know of, it the pte is clear, neither of
> > those tests will trigger, so it just happens to work, but it's still
> > wrong.
>
> Probably it can fail for !present nonlinear mappings on many
> architectures.

Definitely.

--
Mathematics is the supreme nostalgia of our time.

2008-01-25 16:29:30

by Randy Dunlap

[permalink] [raw]

Subject: Re: [PATCH -v8 4/4] The design document for memory-mapped file times update

On Wed, 23 Jan 2008 02:21:20 +0300 Anton Salikhmetov wrote:

> Add a document, which describes how the POSIX requirements on updating
> memory-mapped file times are addressed in Linux.

Hi Anton,

Just a few small comments below...

> Signed-off-by: Anton Salikhmetov <[email protected]>
> ---
> Documentation/vm/00-INDEX | 2 +
> Documentation/vm/msync.txt | 117 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 119 insertions(+), 0 deletions(-)
>
> diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
> index 2131b00..2726c8d 100644
> --- a/Documentation/vm/00-INDEX
> +++ b/Documentation/vm/00-INDEX
> @@ -6,6 +6,8 @@ hugetlbpage.txt
> - a brief summary of hugetlbpage support in the Linux kernel.
> locking
> - info on how locking and synchronization is done in the Linux vm code.
> +msync.txt
> + - the design document for memory-mapped file times update
> numa
> - information about NUMA specific code in the Linux vm.
> numa_memory_policy.txt
> diff --git a/Documentation/vm/msync.txt b/Documentation/vm/msync.txt
> new file mode 100644
> index 0000000..571a766
> --- /dev/null
> +++ b/Documentation/vm/msync.txt
> @@ -0,0 +1,117 @@
> +
> + The msync() system call and memory-mapped file times
> +
> + Copyright (C) 2008 Anton Salikhmetov
> +
> +The POSIX standard requires that any write reference to memory-mapped file
> +data should result in updating the ctime and mtime for that file. Moreover,
> +the standard mandates that updated file times should become visible to the
> +world no later than at the next call to msync().
> +
> +Failure to meet this requirement creates difficulties for certain classes
> +of important applications. For instance, database backup systems fail to
> +pick up the files modified via the mmap() interface. Also, this is a
> +security hole, which allows forging file data in such a manner that proving
> +the fact that file data was modified is not possible.
> +
> +Briefly put, this requirement can be stated as follows:
> +
> + once the file data has changed, the operating system
> + should acknowledge this fact by updating file metadata.
> +
> +This document describes how this POSIX requirement is addressed in Linux.
> +
> +1. Requirements
> +
> +1.1) the POSIX standard requires updating ctime and mtime not later
> +than at the call to msync() with MS_SYNC or MS_ASYNC flags;
> +
> +1.2) in existing POSIX implementations, ctime and mtime
> +get updated not later than at the call to fsync();
> +
> +1.3) in existing POSIX implementation, ctime and mtime
> +get updated not later than at the call to sync(), the "auto-update" feature;
> +
> +1.4) the customers require and the common sense suggests that
> +ctime and mtime should be updated not later than at the call to munmap()
> +or exit(), the latter function implying an implicit call to munmap();
> +
> +1.5) the (1.1) item should be satisfied if the file is a block device
> +special file;
> +
> +1.6) the (1.1) item should be satisfied for files residing on
> +memory-backed filesystems such as tmpfs, too.
> +
> +The following operating systems were used as the reference platforms
> +and are referred to as the "existing implementations" above:
> +HP-UX B.11.31 and FreeBSD 6.2-RELEASE.
> +
> +2. Lazy update
> +
> +Many attempts before the current version implemented the "lazy update" approach
> +to satisfying the requirements given above. Within the latter approach, ctime
> +and mtime get updated at last moment allowable.
> +
> +Since we don't update the file times immediately, some Flag has to be
> +used. When up, this Flag means that the file data was modified and
> +the file times need to be updated as soon as possible.

I would s/up/set/ above and below.

> +Any existing "dirty" flag which, when up, mean that a page has been written to,

s/mean/means/

> +is not suitable for this purpose. Indeed, msync() called with MS_ASYNC
> +would have to reset this "dirty" flag after updating ctime and mtime.
> +The sys_msync() function itself is basically a no-op in the MS_ASYNC case.
> +Thereby, the synchronization routines relying upon this "dirty" flag
> +would lose data. Therefore, a new Flag has to be introduced.
> +
> +The (1.5) item coupled with (1.3) requirement leads to hard work with
> +the block device inodes. Specifically, during writeback it is impossible to
> +tell which block device file was originally mapped. Therefore, we need to
> +traverse the list of "active" devices associated with the block device inode.
> +This would lead to updating file times for block device files, which were not
> +taking part in the data transfer.
> +
> +Also all versions prior to version 6 failed to correctly process ctime and
> +mtime for files on the memory-backed filesystems such as tmpfs. So the (1.6)
> +requirement was not satisfied.
> +
> +If a write reference has occurred between two consecutive calls to msync()
> +with MS_ASYNC, the second call to the latter function should take into
> +account the last write reference. The last write reference can not be caught

s/can not/cannot/

> +if no pagefault occurs. Hence a pagefault needs to be forced. This can be done
> +using two different approaches. The first one is to synchronize data even when
> +msync() was called with MS_ASYNC. This is not acceptable because the current
> +design of the sys_msync() routine forbids starting I/O for the MS_ASYNC case.
> +The second approach is to write protect the page for triggering a pagefault

s/write protect/write-protect/

> +at the next write reference. Note that the dirty flag for the page should not
> +be cleared thereby.
> +
> +In the "lazy update" approach, the requirements (1.1), (1.2), (1.3), and (1.4)
> +taken together result in adding code at least to the following kernel routines:
> +sys_msync(), do_fsync(), some routine in the unmap() call path, some routine
> +in the sync() call path.
> +
> +Finally, a file_update_time()-like function would have to be created for
> +processing the inode objects, not file objects. This is due to the fact that
> +during the sync() operation, the file object may not exist any more, only
> +the inode is known.
> +
> +To sum up: this "lazy" approach leads to massive changes, incurs overhead in
> +the block device case, and requires complicated design decisions.
> +
> +3. Immediate update
> +
> +OK, still reading? There's a better way.
> +
> +In a fashion analogous to what happens at write(2), react to the fact
> +that the page gets dirtied by updating the file times immediately.
> +Thereby any page writeback happens when the write reference has already
> +been accounted for from the view point of file times.

Probably s/view point/viewpoint/.

> +
> +The only problem which remains is to force refreshing file times at the write
> +reference following a call to msync() with MS_ASYNC. As mentioned above, all
> +that is needed here is to force a pagefault.
> +
> +The vma_wrprotect() routine introduced in this patch series is called
> +from sys_msync() in the MS_ASYNC case. The former routine is essentially
> +a version of existing page_mkclean_one() function from mm/rmap.c. Unlike
> +the latter function, the vma_wrprotect() does not touch the dirty bit.
> --

Thanks for the design document.
---
~Randy

2008-01-25 16:40:25