2009-04-08 20:03:01

by Ying Han

Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

support for FAULT_FLAG_RETRY with no user change:

Signed-off-by: Ying Han <[email protected]>
Mike Waychison <[email protected]>

include/linux/fs.h | 2 +-
include/linux/mm.h | 2 +
mm/filemap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
mm/memory.c | 33 +++++++++++++++++------


diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4a853ef..29c2c39 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -793,7 +793,7 @@ struct file_ra_state {
there are only # of pages ahead */

unsigned int ra_pages; /* Maximum readahead window */
- int mmap_miss; /* Cache miss stat for mmap accesses */
+ unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
loff_t prev_pos; /* Cache last read() position */
};

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f7..5a134a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];

#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
+#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */


/*
@@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)

#define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */

+#define VM_FAULT_RETRY 0x0010
#define VM_FAULT_OOM 0x0001
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR 0x0004
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f89..6eb7c36 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -714,6 +714,58 @@ repeat:
EXPORT_SYMBOL(find_lock_page);

/**
+ * find_lock_page_retry - locate, pin and lock a pagecache page
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @vma: vma in which the fault was taken
+ * @ppage: NULL if page not present, otherwise points to the page in pagecache
+ * @retry: 1 indicates the caller can tolerate a retry.
+ *
+ * If the retry flag is on, and the page is already locked by someone else,
+ * return a retry hint and leave *ppage untouched.
+ *
+ * Return *ppage==NULL if the page is not in pagecache. Otherwise *ppage
+ * points to the page in the pagecache, with ret=VM_FAULT_RETRY indicating a
+ * retry hint to the caller, or ret=0 meaning the page was successfully
+ * locked.
+ */
+unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
+ struct vm_area_struct *vma, struct page **ppage,
+ int retry)
+{
+ unsigned int ret = 0;
+ struct page *page;
+
+repeat:
+ page = find_get_page(mapping, offset);
+ if (page) {
+ if (!retry)
+ lock_page(page);
+ else {
+ if (!trylock_page(page)) {
+ struct mm_struct *mm = vma->vm_mm;
+
+ up_read(&mm->mmap_sem);
+ wait_on_page_locked(page);
+ down_read(&mm->mmap_sem);
+
+ page_cache_release(page);
+ return VM_FAULT_RETRY;
+ }
+ }
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ VM_BUG_ON(page->index != offset);
+ }
+ *ppage = page;
+ return ret;
+}
+EXPORT_SYMBOL(find_lock_page_retry);
+
+/**
* find_or_create_page - locate or add a pagecache page
* @mapping: the page's address_space
* @index: the page's index into the mapping
@@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
pgoff_t size;
int did_readaround = 0;
int ret = 0;
+ int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
+ int retry_ret;

size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
@@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
*/
retry_find:
page = find_lock_page(mapping, vmf->pgoff);
+
/*
* For sequential accesses, we use the generic readahead logic.
*/
@@ -1465,7 +1520,13 @@ retry_find:
if (!page) {
page_cache_sync_readahead(mapping, ra, file,
vmf->pgoff, 1);
- page = find_lock_page(mapping, vmf->pgoff);
+ retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+ vma, &page, retry_flag);
+ if (retry_ret == VM_FAULT_RETRY) {
+ /* counteract the followed retry hit */
+ ra->mmap_miss++;
+ return retry_ret;
+ }
if (!page)
goto no_cached_page;
}
@@ -1504,7 +1565,14 @@ retry_find:
start = vmf->pgoff - ra_pages / 2;
do_page_cache_readahead(mapping, file, start, ra_pages);
}
- page = find_lock_page(mapping, vmf->pgoff);
+retry_find_retry:
+ retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+ vma, &page, retry_flag);
+ if (retry_ret == VM_FAULT_RETRY) {
+ /* counteract the followed retry hit */
+ ra->mmap_miss++;
+ return retry_ret;
+ }
if (!page)
goto no_cached_page;
}
@@ -1548,7 +1616,7 @@ no_cached_page:
* meantime, we'll just come back here and read it again.
*/
if (error >= 0)
- goto retry_find;
+ goto retry_find_retry;

/*
* An error return from page_cache_read can result if the
diff --git a/mm/memory.c b/mm/memory.c
index 164951c..5e215c9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
vmf.page = NULL;

ret = vma->vm_ops->fault(vma, &vmf);
+
+ /* page may be available, but we have to restart the process
+ * because mmap_sem was dropped during the ->fault
+ */
+ if (ret & VM_FAULT_RETRY)
+ return ret;
+
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;

@@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+ int write = write_access & ~FAULT_FLAG_RETRY;
+ unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);

+ flags |= (write_access & FAULT_FLAG_RETRY);
pte_unmap(page_table);
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
@@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ int ret;

__set_current_state(TASK_RUNNING);

- count_vm_event(PGFAULT);
-
- if (unlikely(is_vm_hugetlb_page(vma)))
- return hugetlb_fault(mm, vma, address, write_access);
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ ret = hugetlb_fault(mm, vma, address, write_access);
+ goto out;
+ }

+ ret = VM_FAULT_OOM;
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
- return VM_FAULT_OOM;
+ goto out;
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
- return VM_FAULT_OOM;
+ goto out;
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
- return VM_FAULT_OOM;
+ goto out;

- return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+ ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+out:
+ if (!(ret & VM_FAULT_RETRY))
+ count_vm_event(PGFAULT);
+ return ret;
}

#ifndef __PAGETABLE_PUD_FOLDED
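
The arch-side consumer of FAULT_FLAG_RETRY / VM_FAULT_RETRY is in the
[2/2] patch, which is not included in this message.  A rough sketch of
how a fault handler might drive the protocol added above, assuming the
overloaded write_access argument used by this patch (structure and names
here are illustrative only, not the actual 2/2 code):

	int fault;
	unsigned int retry_flag = FAULT_FLAG_RETRY;

	down_read(&mm->mmap_sem);
retry:
	vma = find_vma(mm, address);
	/*
	 * ... the usual vma and access checks, elided; 'write' below is
	 * the ordinary 0/1 write-fault boolean ...
	 */

	/*
	 * This patch overloads write_access: bit 0 is still the "write
	 * fault" boolean, and FAULT_FLAG_RETRY asks the VM to return
	 * VM_FAULT_RETRY instead of sleeping on a locked pagecache page.
	 */
	fault = handle_mm_fault(mm, vma, address, write | retry_flag);
	if (fault & VM_FAULT_RETRY) {
		/*
		 * mmap_sem was dropped and re-taken inside ->fault, so
		 * the vma may be stale.  Redo the lookup, this time
		 * letting the fault block so progress is guaranteed.
		 */
		retry_flag = 0;
		goto retry;
	}
	up_read(&mm->mmap_sem);

Because find_lock_page_retry() has already waited for the page lock to
clear before returning VM_FAULT_RETRY, the retried fault will normally
find the page up to date and complete without blocking.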


2009-04-09 07:48:26

by Fengguang Wu

Subject: Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

On Thu, Apr 09, 2009 at 04:02:35AM +0800, Ying Han wrote:
> support for FAULT_FLAG_RETRY with no user change:

A better changelog is desired, otherwise:

Signed-off-by: Wu Fengguang <[email protected]>

> Signed-off-by: Ying Han <[email protected]>
> Mike Waychison <[email protected]>
>
> include/linux/fs.h | 2 +-
> include/linux/mm.h | 2 +
> mm/filemap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
> mm/memory.c | 33 +++++++++++++++++------
>
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 4a853ef..29c2c39 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -793,7 +793,7 @@ struct file_ra_state {
> there are only # of pages ahead */
>
> unsigned int ra_pages; /* Maximum readahead window */
> - int mmap_miss; /* Cache miss stat for mmap accesses */
> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
> loff_t prev_pos; /* Cache last read() position */
> };
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ffee2f7..5a134a9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>
> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
>
>
> /*
> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>
> #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
>
> +#define VM_FAULT_RETRY 0x0010
> #define VM_FAULT_OOM 0x0001
> #define VM_FAULT_SIGBUS 0x0002
> #define VM_FAULT_MAJOR 0x0004

Why not append VM_FAULT_RETRY here, and indent the _OOM line?
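
Presumably that means keeping the numeric order, i.e. something like
(sketch of the suggested layout only):

#define VM_FAULT_OOM	0x0001
#define VM_FAULT_SIGBUS	0x0002
#define VM_FAULT_MAJOR	0x0004
/* ... any existing flags between 0x0004 and 0x0010 stay put ... */
#define VM_FAULT_RETRY	0x0010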

> diff --git a/mm/filemap.c b/mm/filemap.c
> index f3e5f89..6eb7c36 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -714,6 +714,58 @@ repeat:
> EXPORT_SYMBOL(find_lock_page);
>
> /**
> + * find_lock_page_retry - locate, pin and lock a pagecache page
> + * @mapping: the address_space to search
> + * @offset: the page index
> + * @vma: vma in which the fault was taken
> + * @ppage: NULL if page not present, otherwise points to the page in pagecache
> + * @retry: 1 indicates the caller can tolerate a retry.
> + *
> + * If the retry flag is on, and the page is already locked by someone else,
> + * return a retry hint and leave *ppage untouched.
> + *
> + * Return *ppage==NULL if the page is not in pagecache. Otherwise *ppage
> + * points to the page in the pagecache, with ret=VM_FAULT_RETRY indicating a
> + * retry hint to the caller, or ret=0 meaning the page was successfully
> + * locked.
> + */
> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> + struct vm_area_struct *vma, struct page **ppage,
> + int retry)
> +{
> + unsigned int ret = 0;
> + struct page *page;
> +
> +repeat:
> + page = find_get_page(mapping, offset);
> + if (page) {
> + if (!retry)
> + lock_page(page);
> + else {
> + if (!trylock_page(page)) {
> + struct mm_struct *mm = vma->vm_mm;
> +
> + up_read(&mm->mmap_sem);
> + wait_on_page_locked(page);
> + down_read(&mm->mmap_sem);
> +
> + page_cache_release(page);
> + return VM_FAULT_RETRY;
> + }
> + }
> + if (unlikely(page->mapping != mapping)) {
> + unlock_page(page);
> + page_cache_release(page);
> + goto repeat;
> + }
> + VM_BUG_ON(page->index != offset);
> + }
> + *ppage = page;
> + return ret;
> +}
> +EXPORT_SYMBOL(find_lock_page_retry);
> +
> +/**
> * find_or_create_page - locate or add a pagecache page
> * @mapping: the page's address_space
> * @index: the page's index into the mapping
> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> pgoff_t size;
> int did_readaround = 0;
> int ret = 0;
> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> + int retry_ret;
>
> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> if (vmf->pgoff >= size)
> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> */
> retry_find:
> page = find_lock_page(mapping, vmf->pgoff);
> +
> /*
> * For sequential accesses, we use the generic readahead logic.
> */
> @@ -1465,7 +1520,13 @@ retry_find:
> if (!page) {
> page_cache_sync_readahead(mapping, ra, file,
> vmf->pgoff, 1);
> - page = find_lock_page(mapping, vmf->pgoff);
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> + if (retry_ret == VM_FAULT_RETRY) {
> + /* counteract the followed retry hit */
> + ra->mmap_miss++;

Please don't relocate the comment... because that will break my
following patches (mainly Linus' filemap cleanups), which will
_heavily_ rework these chunks anyway, and make it hard for Andrew
to merge them.

> + return retry_ret;
> + }

> if (!page)
> goto no_cached_page;
> }
> @@ -1504,7 +1565,14 @@ retry_find:
> start = vmf->pgoff - ra_pages / 2;
> do_page_cache_readahead(mapping, file, start, ra_pages);
> }
> - page = find_lock_page(mapping, vmf->pgoff);
> +retry_find_retry:
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> + if (retry_ret == VM_FAULT_RETRY) {
> + /* counteract the followed retry hit */
> + ra->mmap_miss++;

ditto

Thanks,
Fengguang

> + return retry_ret;
> + }
> if (!page)
> goto no_cached_page;
> }
> @@ -1548,7 +1616,7 @@ no_cached_page:
> * meantime, we'll just come back here and read it again.
> */
> if (error >= 0)
> - goto retry_find;
> + goto retry_find_retry;
>
> /*
> * An error return from page_cache_read can result if the
> diff --git a/mm/memory.c b/mm/memory.c
> index 164951c..5e215c9 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
> vmf.page = NULL;
>
> ret = vma->vm_ops->fault(vma, &vmf);
> +
> + /* page may be available, but we have to restart the process
> + * because mmap_sem was dropped during the ->fault
> + */
> + if (ret & VM_FAULT_RETRY)
> + return ret;
> +
> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> return ret;
>
> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> {
> pgoff_t pgoff = (((address & PAGE_MASK)
> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> + int write = write_access & ~FAULT_FLAG_RETRY;
> + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>
> + flags |= (write_access & FAULT_FLAG_RETRY);
> pte_unmap(page_table);
> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> }
> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
> pud_t *pud;
> pmd_t *pmd;
> pte_t *pte;
> + int ret;
>
> __set_current_state(TASK_RUNNING);
>
> - count_vm_event(PGFAULT);
> -
> - if (unlikely(is_vm_hugetlb_page(vma)))
> - return hugetlb_fault(mm, vma, address, write_access);
> + if (unlikely(is_vm_hugetlb_page(vma))) {
> + ret = hugetlb_fault(mm, vma, address, write_access);
> + goto out;
> + }
>
> + ret = VM_FAULT_OOM;
> pgd = pgd_offset(mm, address);
> pud = pud_alloc(mm, pgd, address);
> if (!pud)
> - return VM_FAULT_OOM;
> + goto out;
> pmd = pmd_alloc(mm, pud, address);
> if (!pmd)
> - return VM_FAULT_OOM;
> + goto out;
> pte = pte_alloc_map(mm, pmd, address);
> if (!pte)
> - return VM_FAULT_OOM;
> + goto out;
>
> - return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> + ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> +out:
> + if (!(ret & VM_FAULT_RETRY))
> + count_vm_event(PGFAULT);
> + return ret;
> }
>
> #ifndef __PAGETABLE_PUD_FOLDED

2009-04-09 16:21:50

by Ying Han

Subject: Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

On Thu, Apr 9, 2009 at 12:47 AM, Wu Fengguang <[email protected]> wrote:
> On Thu, Apr 09, 2009 at 04:02:35AM +0800, Ying Han wrote:
>> support for FAULT_FLAG_RETRY with no user change:
>
> A better changelog is desired, otherwise:
>
> Signed-off-by: Wu Fengguang <[email protected]>
>
>> Signed-off-by: Ying Han <[email protected]>
>> Mike Waychison <[email protected]>
>>
>> include/linux/fs.h | 2 +-
>> include/linux/mm.h | 2 +
>> mm/filemap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
>> mm/memory.c | 33 +++++++++++++++++------
>>
>>
>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>> index 4a853ef..29c2c39 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -793,7 +793,7 @@ struct file_ra_state {
>> there are only # of pages ahead */
>>
>> unsigned int ra_pages; /* Maximum readahead window */
>> - int mmap_miss; /* Cache miss stat for mmap accesses */
>> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
>> loff_t prev_pos; /* Cache last read() position */
>> };
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index ffee2f7..5a134a9 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>>
>> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
>> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
>> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
>>
>>
>> /*
>> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>>
>> #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
>>
>> +#define VM_FAULT_RETRY 0x0010
>> #define VM_FAULT_OOM 0x0001
>> #define VM_FAULT_SIGBUS 0x0002
>> #define VM_FAULT_MAJOR 0x0004
>
> Why not append VM_FAULT_RETRY here, and indent the _OOM line?
>
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index f3e5f89..6eb7c36 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -714,6 +714,58 @@ repeat:
>> EXPORT_SYMBOL(find_lock_page);
>>
>> /**
>> + * find_lock_page_retry - locate, pin and lock a pagecache page
>> + * @mapping: the address_space to search
>> + * @offset: the page index
>> + * @vma: vma in which the fault was taken
>> + * @ppage: NULL if page not present, otherwise points to the page in pagecache
>> + * @retry: 1 indicates the caller can tolerate a retry.
>> + *
>> + * If the retry flag is on, and the page is already locked by someone else,
>> + * return a retry hint and leave *ppage untouched.
>> + *
>> + * Return *ppage==NULL if the page is not in pagecache. Otherwise *ppage
>> + * points to the page in the pagecache, with ret=VM_FAULT_RETRY indicating a
>> + * retry hint to the caller, or ret=0 meaning the page was successfully
>> + * locked.
>> + */
>> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
>> + struct vm_area_struct *vma, struct page **ppage,
>> + int retry)
>> +{
>> + unsigned int ret = 0;
>> + struct page *page;
>> +
>> +repeat:
>> + page = find_get_page(mapping, offset);
>> + if (page) {
>> + if (!retry)
>> + lock_page(page);
>> + else {
>> + if (!trylock_page(page)) {
>> + struct mm_struct *mm = vma->vm_mm;
>> +
>> + up_read(&mm->mmap_sem);
>> + wait_on_page_locked(page);
>> + down_read(&mm->mmap_sem);
>> +
>> + page_cache_release(page);
>> + return VM_FAULT_RETRY;
>> + }
>> + }
>> + if (unlikely(page->mapping != mapping)) {
>> + unlock_page(page);
>> + page_cache_release(page);
>> + goto repeat;
>> + }
>> + VM_BUG_ON(page->index != offset);
>> + }
>> + *ppage = page;
>> + return ret;
>> +}
>> +EXPORT_SYMBOL(find_lock_page_retry);
>> +
>> +/**
>> * find_or_create_page - locate or add a pagecache page
>> * @mapping: the page's address_space
>> * @index: the page's index into the mapping
>> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>> pgoff_t size;
>> int did_readaround = 0;
>> int ret = 0;
>> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
>> + int retry_ret;
>>
>> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>> if (vmf->pgoff >= size)
>> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>> */
>> retry_find:
>> page = find_lock_page(mapping, vmf->pgoff);
>> +
>> /*
>> * For sequential accesses, we use the generic readahead logic.
>> */
>> @@ -1465,7 +1520,13 @@ retry_find:
>> if (!page) {
>> page_cache_sync_readahead(mapping, ra, file,
>> vmf->pgoff, 1);
>> - page = find_lock_page(mapping, vmf->pgoff);
>> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> + vma, &page, retry_flag);
>> + if (retry_ret == VM_FAULT_RETRY) {
>> + /* counteract the followed retry hit */
>> + ra->mmap_miss++;
>
> Please don't relocate the comment... because that will break my
> following patches (mainly Linus' filemap cleanups), which will
> _heavily_ rework these chunks anyway, and make it hard for Andrew
> to merge them.

Then how do you keep the lines within 80 characters?
>
>> + return retry_ret;
>> + }
>
>> if (!page)
>> goto no_cached_page;
>> }
>> @@ -1504,7 +1565,14 @@ retry_find:
>> start = vmf->pgoff - ra_pages / 2;
>> do_page_cache_readahead(mapping, file, start, ra_pages);
>> }
>> - page = find_lock_page(mapping, vmf->pgoff);
>> +retry_find_retry:
>> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> + vma, &page, retry_flag);
>> + if (retry_ret == VM_FAULT_RETRY) {
>> + /* counteract the followed retry hit */
>> + ra->mmap_miss++;
>
> ditto
>
> Thanks,
> Fengguang
>
>> + return retry_ret;
>> + }
>> if (!page)
>> goto no_cached_page;
>> }
>> @@ -1548,7 +1616,7 @@ no_cached_page:
>> * meantime, we'll just come back here and read it again.
>> */
>> if (error >= 0)
>> - goto retry_find;
>> + goto retry_find_retry;
>>
>> /*
>> * An error return from page_cache_read can result if the
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 164951c..5e215c9 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
>> vmf.page = NULL;
>>
>> ret = vma->vm_ops->fault(vma, &vmf);
>> +
>> + /* page may be available, but we have to restart the process
>> + * because mmap_sem was dropped during the ->fault
>> + */
>> + if (ret & VM_FAULT_RETRY)
>> + return ret;
>> +
>> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
>> return ret;
>>
>> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
>> {
>> pgoff_t pgoff = (((address & PAGE_MASK)
>> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>> - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
>> + int write = write_access & ~FAULT_FLAG_RETRY;
>> + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>>
>> + flags |= (write_access & FAULT_FLAG_RETRY);
>> pte_unmap(page_table);
>> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
>> }
>> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
>> pud_t *pud;
>> pmd_t *pmd;
>> pte_t *pte;
>> + int ret;
>>
>> __set_current_state(TASK_RUNNING);
>>
>> - count_vm_event(PGFAULT);
>> -
>> - if (unlikely(is_vm_hugetlb_page(vma)))
>> - return hugetlb_fault(mm, vma, address, write_access);
>> + if (unlikely(is_vm_hugetlb_page(vma))) {
>> + ret = hugetlb_fault(mm, vma, address, write_access);
>> + goto out;
>> + }
>>
>> + ret = VM_FAULT_OOM;
>> pgd = pgd_offset(mm, address);
>> pud = pud_alloc(mm, pgd, address);
>> if (!pud)
>> - return VM_FAULT_OOM;
>> + goto out;
>> pmd = pmd_alloc(mm, pud, address);
>> if (!pmd)
>> - return VM_FAULT_OOM;
>> + goto out;
>> pte = pte_alloc_map(mm, pmd, address);
>> if (!pte)
>> - return VM_FAULT_OOM;
>> + goto out;
>>
>> - return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> + ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> +out:
>> + if (!(ret & VM_FAULT_RETRY))
>> + count_vm_event(PGFAULT);
>> + return ret;
>> }
>>
>> #ifndef __PAGETABLE_PUD_FOLDED
>

2009-04-10 00:25:51

by Fengguang Wu

Subject: Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

On Fri, Apr 10, 2009 at 12:21:25AM +0800, Ying Han wrote:
> On Thu, Apr 9, 2009 at 12:47 AM, Wu Fengguang <[email protected]> wrote:
> > On Thu, Apr 09, 2009 at 04:02:35AM +0800, Ying Han wrote:
> >> support for FAULT_FLAG_RETRY with no user change:
> >
> > A better changelog is desired, otherwise:
> >
> > Signed-off-by: Wu Fengguang <[email protected]>
> >
> >> Signed-off-by: Ying Han <[email protected]>
> >> Mike Waychison <[email protected]>
> >>
> >> include/linux/fs.h | 2 +-
> >> include/linux/mm.h | 2 +
> >> mm/filemap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--
> >> mm/memory.c | 33 +++++++++++++++++------
> >>
> >>
> >> diff --git a/include/linux/fs.h b/include/linux/fs.h
> >> index 4a853ef..29c2c39 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -793,7 +793,7 @@ struct file_ra_state {
> >> there are only # of pages ahead */
> >>
> >> unsigned int ra_pages; /* Maximum readahead window */
> >> - int mmap_miss; /* Cache miss stat for mmap accesses */
> >> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
> >> loff_t prev_pos; /* Cache last read() position */
> >> };
> >>
> >> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >> index ffee2f7..5a134a9 100644
> >> --- a/include/linux/mm.h
> >> +++ b/include/linux/mm.h
> >> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
> >>
> >> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
> >> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
> >> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
> >>
> >>
> >> /*
> >> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
> >>
> >> #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
> >>
> >> +#define VM_FAULT_RETRY 0x0010
> >> #define VM_FAULT_OOM 0x0001
> >> #define VM_FAULT_SIGBUS 0x0002
> >> #define VM_FAULT_MAJOR 0x0004
> >
> > Why not append VM_FAULT_RETRY here, and indent the _OOM line?
> >
> >> diff --git a/mm/filemap.c b/mm/filemap.c
> >> index f3e5f89..6eb7c36 100644
> >> --- a/mm/filemap.c
> >> +++ b/mm/filemap.c
> >> @@ -714,6 +714,58 @@ repeat:
> >> EXPORT_SYMBOL(find_lock_page);
> >>
> >> /**
> >> + * find_lock_page_retry - locate, pin and lock a pagecache page
> >> + * @mapping: the address_space to search
> >> + * @offset: the page index
> >> + * @vma: vma in which the fault was taken
> >> + * @ppage: NULL if page not present, otherwise points to the page in pagecache
> >> + * @retry: 1 indicates the caller can tolerate a retry.
> >> + *
> >> + * If the retry flag is on, and the page is already locked by someone else,
> >> + * return a retry hint and leave *ppage untouched.
> >> + *
> >> + * Return *ppage==NULL if the page is not in pagecache. Otherwise *ppage
> >> + * points to the page in the pagecache, with ret=VM_FAULT_RETRY indicating a
> >> + * retry hint to the caller, or ret=0 meaning the page was successfully
> >> + * locked.
> >> + */
> >> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> >> + struct vm_area_struct *vma, struct page **ppage,
> >> + int retry)
> >> +{
> >> + unsigned int ret = 0;
> >> + struct page *page;
> >> +
> >> +repeat:
> >> + page = find_get_page(mapping, offset);
> >> + if (page) {
> >> + if (!retry)
> >> + lock_page(page);
> >> + else {
> >> + if (!trylock_page(page)) {
> >> + struct mm_struct *mm = vma->vm_mm;
> >> +
> >> + up_read(&mm->mmap_sem);
> >> + wait_on_page_locked(page);
> >> + down_read(&mm->mmap_sem);
> >> +
> >> + page_cache_release(page);
> >> + return VM_FAULT_RETRY;
> >> + }
> >> + }
> >> + if (unlikely(page->mapping != mapping)) {
> >> + unlock_page(page);
> >> + page_cache_release(page);
> >> + goto repeat;
> >> + }
> >> + VM_BUG_ON(page->index != offset);
> >> + }
> >> + *ppage = page;
> >> + return ret;
> >> +}
> >> +EXPORT_SYMBOL(find_lock_page_retry);
> >> +
> >> +/**
> >> * find_or_create_page - locate or add a pagecache page
> >> * @mapping: the page's address_space
> >> * @index: the page's index into the mapping
> >> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> >> pgoff_t size;
> >> int did_readaround = 0;
> >> int ret = 0;
> >> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> >> + int retry_ret;
> >>
> >> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> >> if (vmf->pgoff >= size)
> >> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> >> */
> >> retry_find:
> >> page = find_lock_page(mapping, vmf->pgoff);
> >> +
> >> /*
> >> * For sequential accesses, we use the generic readahead logic.
> >> */
> >> @@ -1465,7 +1520,13 @@ retry_find:
> >> if (!page) {
> >> page_cache_sync_readahead(mapping, ra, file,
> >> vmf->pgoff, 1);
> >> - page = find_lock_page(mapping, vmf->pgoff);
> >> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> >> + vma, &page, retry_flag);
> >> + if (retry_ret == VM_FAULT_RETRY) {
> >> + /* counteract the followed retry hit */
> >> + ra->mmap_miss++;
> >
> > Please don't relocate the comment... because that will break my
> > following patches (mainly Linus' filemap cleanups), which will
> > _heavily_ rework these chunks anyway, and make it hard for Andrew
> > to merge them.
>
> Then how do you keep the lines within 80 characters?

The above line will be removed by my following patch, and...

> >
> >> + return retry_ret;
> >> + }
> >
> >> if (!page)
> >> goto no_cached_page;
> >> }
> >> @@ -1504,7 +1565,14 @@ retry_find:
> >> start = vmf->pgoff - ra_pages / 2;
> >> do_page_cache_readahead(mapping, file, start, ra_pages);
> >> }
> >> - page = find_lock_page(mapping, vmf->pgoff);
> >> +retry_find_retry:
> >> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> >> + vma, &page, retry_flag);
> >> + if (retry_ret == VM_FAULT_RETRY) {
> >> + /* counteract the followed retry hit */
> >> + ra->mmap_miss++;
> >

...this line can keep the comment within 80 columns, and will be kept
by my patch.

Thanks,
Fengguang


> > ditto
> >
> > Thanks,
> > Fengguang
> >
> >> + return retry_ret;
> >> + }
> >> if (!page)
> >> goto no_cached_page;
> >> }
> >> @@ -1548,7 +1616,7 @@ no_cached_page:
> >> * meantime, we'll just come back here and read it again.
> >> */
> >> if (error >= 0)
> >> - goto retry_find;
> >> + goto retry_find_retry;
> >>
> >> /*
> >> * An error return from page_cache_read can result if the
> >> diff --git a/mm/memory.c b/mm/memory.c
> >> index 164951c..5e215c9 100644
> >> --- a/mm/memory.c
> >> +++ b/mm/memory.c
> >> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
> >> vmf.page = NULL;
> >>
> >> ret = vma->vm_ops->fault(vma, &vmf);
> >> +
> >> + /* page may be available, but we have to restart the process
> >> + * because mmap_sem was dropped during the ->fault
> >> + */
> >> + if (ret & VM_FAULT_RETRY)
> >> + return ret;
> >> +
> >> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> >> return ret;
> >>
> >> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> >> {
> >> pgoff_t pgoff = (((address & PAGE_MASK)
> >> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> >> - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> >> + int write = write_access & ~FAULT_FLAG_RETRY;
> >> + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> >>
> >> + flags |= (write_access & FAULT_FLAG_RETRY);
> >> pte_unmap(page_table);
> >> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> >> }
> >> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
> >> pud_t *pud;
> >> pmd_t *pmd;
> >> pte_t *pte;
> >> + int ret;
> >>
> >> __set_current_state(TASK_RUNNING);
> >>
> >> - count_vm_event(PGFAULT);
> >> -
> >> - if (unlikely(is_vm_hugetlb_page(vma)))
> >> - return hugetlb_fault(mm, vma, address, write_access);
> >> + if (unlikely(is_vm_hugetlb_page(vma))) {
> >> + ret = hugetlb_fault(mm, vma, address, write_access);
> >> + goto out;
> >> + }
> >>
> >> + ret = VM_FAULT_OOM;
> >> pgd = pgd_offset(mm, address);
> >> pud = pud_alloc(mm, pgd, address);
> >> if (!pud)
> >> - return VM_FAULT_OOM;
> >> + goto out;
> >> pmd = pmd_alloc(mm, pud, address);
> >> if (!pmd)
> >> - return VM_FAULT_OOM;
> >> + goto out;
> >> pte = pte_alloc_map(mm, pmd, address);
> >> if (!pte)
> >> - return VM_FAULT_OOM;
> >> + goto out;
> >>
> >> - return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> >> + ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> >> +out:
> >> + if (!(ret & VM_FAULT_RETRY))
> >> + count_vm_event(PGFAULT);
> >> + return ret;
> >> }
> >>
> >> #ifndef __PAGETABLE_PUD_FOLDED
> >

2009-04-10 06:10:32

by Andrew Morton

Subject: Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY


> Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

Please give each patch in the series a unique and meaningful title.

On Wed, 8 Apr 2009 13:02:35 -0700 Ying Han <[email protected]> wrote:

> support for FAULT_FLAG_RETRY with no user change:

yup, we'd prefer a complete changelog here please.

> Signed-off-by: Ying Han <[email protected]>
> Mike Waychison <[email protected]>

This form:

Signed-off-by: Ying Han <[email protected]>
Signed-off-by: Mike Waychison <[email protected]>

is conventional.

> index 4a853ef..29c2c39 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -793,7 +793,7 @@ struct file_ra_state {
> there are only # of pages ahead */
>
> unsigned int ra_pages; /* Maximum readahead window */
> - int mmap_miss; /* Cache miss stat for mmap accesses */
> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */

This change makes sense, but we're not told the reasons for making it?
Did it fix a bug, or is it an unrelated fixlet, or...?

> loff_t prev_pos; /* Cache last read() position */
> };
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ffee2f7..5a134a9 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>
> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
>
>
> /*
> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>
> #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
>
> +#define VM_FAULT_RETRY 0x0010
> #define VM_FAULT_OOM 0x0001
> #define VM_FAULT_SIGBUS 0x0002
> #define VM_FAULT_MAJOR 0x0004
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f3e5f89..6eb7c36 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -714,6 +714,58 @@ repeat:
> EXPORT_SYMBOL(find_lock_page);
>
> /**
> + * find_lock_page_retry - locate, pin and lock a pagecache page
> + * @mapping: the address_space to search
> + * @offset: the page index
> + * @vma: vma in which the fault was taken
> + * @ppage: NULL if page not present, otherwise points to the page in pagecache
> + * @retry: 1 indicates the caller can tolerate a retry.
> + *
> + * If the retry flag is on, and the page is already locked by someone else,
> + * return a retry hint and leave *ppage untouched.
> + *
> + * Return *ppage==NULL if the page is not in pagecache. Otherwise *ppage
> + * points to the page in the pagecache, with ret=VM_FAULT_RETRY indicating a
> + * retry hint to the caller, or ret=0 meaning the page was successfully
> + * locked.
> + */

How about this:

If the page was not found in pagecache, find_lock_page_retry()
returns 0 and sets *@ppage to NULL.

If the page was found in pagecache but is locked and @retry is
true, find_lock_page_retry() returns VM_FAULT_RETRY and does not
write to *@ppage.

If the page was found in pagecache and @retry is false,
find_lock_page_retry() locks the page, writes its address to *@ppage
and returns 0.

> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> + struct vm_area_struct *vma, struct page **ppage,
> + int retry)
> +{
> + unsigned int ret = 0;
> + struct page *page;
> +
> +repeat:
> + page = find_get_page(mapping, offset);
> + if (page) {
> + if (!retry)
> + lock_page(page);
> + else {
> + if (!trylock_page(page)) {
> + struct mm_struct *mm = vma->vm_mm;
> +
> + up_read(&mm->mmap_sem);
> + wait_on_page_locked(page);

It looks strange that we wait for the page lock and then don't
just lock the page and return it.

Adding a comment explaining _why_ we do this would be nice.
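
Something along these lines inside find_lock_page_retry() would
presumably cover it; the wording below is only a guess at the intent,
pieced together from the __do_fault() comment later in this patch:

	if (!trylock_page(page)) {
		struct mm_struct *mm = vma->vm_mm;

		/*
		 * The page is locked by someone else, most likely for
		 * read I/O.  Rather than sleeping on it with mmap_sem
		 * held, drop the semaphore and wait for the lock to be
		 * released, so the page is probably up to date by the
		 * time the fault is retried.  We still cannot simply
		 * lock the page and return it: once mmap_sem has been
		 * dropped the vma may have changed under us, so the
		 * whole fault must be restarted.  VM_FAULT_RETRY tells
		 * the caller to do exactly that.
		 */
		up_read(&mm->mmap_sem);
		wait_on_page_locked(page);
		down_read(&mm->mmap_sem);

		page_cache_release(page);
		return VM_FAULT_RETRY;
	}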

> + down_read(&mm->mmap_sem);
> +
> + page_cache_release(page);
> + return VM_FAULT_RETRY;
> + }
> + }
> + if (unlikely(page->mapping != mapping)) {
> + unlock_page(page);
> + page_cache_release(page);
> + goto repeat;
> + }
> + VM_BUG_ON(page->index != offset);
> + }
> + *ppage = page;
> + return ret;
> +}
> +EXPORT_SYMBOL(find_lock_page_retry);
> +
> +/**
> * find_or_create_page - locate or add a pagecache page
> * @mapping: the page's address_space
> * @index: the page's index into the mapping
> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> pgoff_t size;
> int did_readaround = 0;
> int ret = 0;
> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> + int retry_ret;
>
> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> if (vmf->pgoff >= size)
> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
> */
> retry_find:
> page = find_lock_page(mapping, vmf->pgoff);
> +
> /*
> * For sequential accesses, we use the generic readahead logic.
> */
> @@ -1465,7 +1520,13 @@ retry_find:
> if (!page) {
> page_cache_sync_readahead(mapping, ra, file,
> vmf->pgoff, 1);
> - page = find_lock_page(mapping, vmf->pgoff);
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> + if (retry_ret == VM_FAULT_RETRY) {
> + /* counteract the followed retry hit */
> + ra->mmap_miss++;
> + return retry_ret;
> + }
> if (!page)
> goto no_cached_page;
> }
> @@ -1504,7 +1565,14 @@ retry_find:
> start = vmf->pgoff - ra_pages / 2;
> do_page_cache_readahead(mapping, file, start, ra_pages);
> }
> - page = find_lock_page(mapping, vmf->pgoff);
> +retry_find_retry:
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> + if (retry_ret == VM_FAULT_RETRY) {
> + /* counteract the followed retry hit */
> + ra->mmap_miss++;
> + return retry_ret;
> + }
> if (!page)
> goto no_cached_page;
> }
> @@ -1548,7 +1616,7 @@ no_cached_page:
> * meantime, we'll just come back here and read it again.
> */
> if (error >= 0)
> - goto retry_find;
> + goto retry_find_retry;
>
> /*
> * An error return from page_cache_read can result if the
> diff --git a/mm/memory.c b/mm/memory.c
> index 164951c..5e215c9 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
> vmf.page = NULL;
>
> ret = vma->vm_ops->fault(vma, &vmf);
> +
> + /* page may be available, but we have to restart the process
> + * because mmap_sem was dropped during the ->fault
> + */

The preferred comment layout is:

/*
* The page may be available, but we have to restart the process
* because mmap_sem was dropped during the ->fault
*/

> + if (ret & VM_FAULT_RETRY)
> + return ret;

Again, I worry that in some places we treat VM_FAULT_RETRY as an
enumerated type:

if (foo == VM_FAULT_RETRY)

but in other places we treat it as a bitfield

if (foo & VM_FAULT_RETRY)

it may not be buggy, but it _looks_ buggy. And confusing and
inconsistent and dangerous.

Can it be improved?
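
One way to make it read consistently would be to treat VM_FAULT_RETRY as
a bit everywhere, even where it happens to be the only possible non-zero
value, e.g. (sketch):

	retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
					 vma, &page, retry_flag);
	if (retry_ret & VM_FAULT_RETRY) {	/* bit test, not '==' */
		ra->mmap_miss++;
		return retry_ret;
	}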

> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> return ret;
>
> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> {
> pgoff_t pgoff = (((address & PAGE_MASK)
> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> + int write = write_access & ~FAULT_FLAG_RETRY;
> + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>
> + flags |= (write_access & FAULT_FLAG_RETRY);

gee, I'm lost.

Can we please redo this as:


int write;
unsigned int flags;

/*
* Big fat comment explaining the next three lines goes here
*/
write = write_access & ~FAULT_FLAG_RETRY;
unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
flags |= (write_access & FAULT_FLAG_RETRY);

?
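
A possible stab at that comment, going by Wu's explanation later in the
thread (sketch only; the final wording is up to the authors):

	int write;
	unsigned int flags;

	/*
	 * The write_access argument is overloaded here: its low bit is
	 * the traditional "was this a write fault" boolean, while
	 * FAULT_FLAG_RETRY says the arch caller can cope with a
	 * VM_FAULT_RETRY return.  Strip the retry bit to recover the
	 * write boolean, turn that into FAULT_FLAG_WRITE, then put the
	 * retry bit back so __do_fault() sees both flags.
	 */
	write = write_access & ~FAULT_FLAG_RETRY;
	flags = (write ? FAULT_FLAG_WRITE : 0);
	flags |= (write_access & FAULT_FLAG_RETRY);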

> pte_unmap(page_table);
> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> }
> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
> pud_t *pud;
> pmd_t *pmd;
> pte_t *pte;
> + int ret;
>
> __set_current_state(TASK_RUNNING);
>
> - count_vm_event(PGFAULT);
> -
> - if (unlikely(is_vm_hugetlb_page(vma)))
> - return hugetlb_fault(mm, vma, address, write_access);
> + if (unlikely(is_vm_hugetlb_page(vma))) {
> + ret = hugetlb_fault(mm, vma, address, write_access);
> + goto out;
> + }
>
> + ret = VM_FAULT_OOM;
> pgd = pgd_offset(mm, address);
> pud = pud_alloc(mm, pgd, address);
> if (!pud)
> - return VM_FAULT_OOM;
> + goto out;
> pmd = pmd_alloc(mm, pud, address);
> if (!pmd)
> - return VM_FAULT_OOM;
> + goto out;
> pte = pte_alloc_map(mm, pmd, address);
> if (!pte)
> - return VM_FAULT_OOM;
> + goto out;
>
> - return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> + ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
> +out:
> + if (!(ret & VM_FAULT_RETRY))
> + count_vm_event(PGFAULT);
> + return ret;
> }

2009-04-10 06:32:22

by Ying Han

Subject: Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

2009/4/9 Andrew Morton <[email protected]>:
>
>> Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
>
> Please give each patch in the series a unique and meaningful title.
>
> On Wed, 8 Apr 2009 13:02:35 -0700 Ying Han <[email protected]> wrote:
>
>> support for FAULT_FLAG_RETRY with no user change:
>
> yup, we'd prefer a complete changelog here please.
>
>> Signed-off-by: Ying Han <[email protected]>
>>              Mike Waychison <[email protected]>
>
> This form:
>
> Signed-off-by: Ying Han <[email protected]>
> Signed-off-by: Mike Waychison <[email protected]>

Thanks Andrew, and I need to add Fengguang to Signed-off-by.

>
> is conventional.
>
>> index 4a853ef..29c2c39 100644
>> --- a/include/linux/fs.h
>> +++ b/include/linux/fs.h
>> @@ -793,7 +793,7 @@ struct file_ra_state {
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?there are only # of pages ahead */
>>
>> ? ? ? unsigned int ra_pages; ? ? ? ? ?/* Maximum readahead window */
>> - ? ? int mmap_miss; ? ? ? ? ? ? ? ? ?/* Cache miss stat for mmap accesses */
>> + ? ? unsigned int mmap_miss; ? ? ? ? /* Cache miss stat for mmap accesses */
>
> This change makes sense, but we're not told the reasons for making it?
> Did it fix a bug, or is it an unrelated fixlet, or...?

Fengguang: could you help comment on this part? I will make the
changes elsewhere as Andrew pointed out. Thanks

>
>> ? ? ? loff_t prev_pos; ? ? ? ? ? ? ? ?/* Cache last read() position */
>> ?};
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index ffee2f7..5a134a9 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -144,6 +144,7 @@ extern pgprot_t protection_map[16];
>>
>> ?#define FAULT_FLAG_WRITE ? ? 0x01 ? ?/* Fault was a write access */
>> ?#define FAULT_FLAG_NONLINEAR 0x02 ? ?/* Fault was via a nonlinear mapping */
>> +#define FAULT_FLAG_RETRY ? ? 0x04 ? ?/* Retry major fault */
>>
>>
>> ?/*
>> @@ -690,6 +691,7 @@ static inline int page_mapped(struct page *page)
>>
>> ?#define VM_FAULT_MINOR ? ? ? 0 /* For backwards compat. Remove me quickly. */
>>
>> +#define VM_FAULT_RETRY ? ? ? 0x0010
>> ?#define VM_FAULT_OOM 0x0001
>> ?#define VM_FAULT_SIGBUS ? ? ?0x0002
>> ?#define VM_FAULT_MAJOR ? ? ? 0x0004
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index f3e5f89..6eb7c36 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -714,6 +714,58 @@ repeat:
>> ?EXPORT_SYMBOL(find_lock_page);
>>
>> ?/**
>> + * find_lock_page_retry - locate, pin and lock a pagecache page
>> + * @mapping: the address_space to search
>> + * @offset: the page index
>> + * @vma: vma in which the fault was taken
>> + * @ppage: NULL if page not present, otherwise points to the page in pagecache
>> + * @retry: 1 indicates the caller can tolerate a retry.
>> + *
>> + * If the retry flag is on, and the page is already locked by someone else,
>> + * return a retry hint and leave *ppage untouched.
>> + *
>> + * Return *ppage==NULL if the page is not in pagecache. Otherwise *ppage
>> + * points to the page in the pagecache, with ret=VM_FAULT_RETRY indicating a
>> + * retry hint to the caller, or ret=0 meaning the page was successfully
>> + * locked.
>> + */
>
> How about this:
>
>  If the page was not found in pagecache, find_lock_page_retry()
>  returns 0 and sets *@ppage to NULL.
>
>  If the page was found in pagecache but is locked and @retry is
>  true, find_lock_page_retry() returns VM_FAULT_RETRY and does not
>  write to *@ppage.
>
>  If the page was found in pagecache and @retry is false,
>  find_lock_page_retry() locks the page, writes its address to *@ppage
>  and returns 0.
>
>> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct vm_area_struct *vma, struct page **ppage,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? int retry)
>> +{
>> + ? ? unsigned int ret = 0;
>> + ? ? struct page *page;
>> +
>> +repeat:
>> + ? ? page = find_get_page(mapping, offset);
>> + ? ? if (page) {
>> + ? ? ? ? ? ? if (!retry)
>> + ? ? ? ? ? ? ? ? ? ? lock_page(page);
>> + ? ? ? ? ? ? else {
>> + ? ? ? ? ? ? ? ? ? ? if (!trylock_page(page)) {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? struct mm_struct *mm = vma->vm_mm;
>> +
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? up_read(&mm->mmap_sem);
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? wait_on_page_locked(page);
>
> It looks strange that we wait for the page lock and then don't
> just lock the page and return it.
>
> Adding a comment explaining _why_ we do this would be nice.
>
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? down_read(&mm->mmap_sem);
>> +
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? page_cache_release(page);
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? return VM_FAULT_RETRY;
>> + ? ? ? ? ? ? ? ? ? ? }
>> + ? ? ? ? ? ? }
>> + ? ? ? ? ? ? if (unlikely(page->mapping != mapping)) {
>> + ? ? ? ? ? ? ? ? ? ? unlock_page(page);
>> + ? ? ? ? ? ? ? ? ? ? page_cache_release(page);
>> + ? ? ? ? ? ? ? ? ? ? goto repeat;
>> + ? ? ? ? ? ? }
>> + ? ? ? ? ? ? VM_BUG_ON(page->index != offset);
>> + ? ? }
>> + ? ? *ppage = page;
>> + ? ? return ret;
>> +}
>> +EXPORT_SYMBOL(find_lock_page_retry);
>> +
>> +/**
>> ? * find_or_create_page - locate or add a pagecache page
>> ? * @mapping: the page's address_space
>> ? * @index: the page's index into the mapping
>> @@ -1444,6 +1496,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>> ? ? ? pgoff_t size;
>> ? ? ? int did_readaround = 0;
>> ? ? ? int ret = 0;
>> + ? ? int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
>> + ? ? int retry_ret;
>>
>> ? ? ? size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
>> ? ? ? if (vmf->pgoff >= size)
>> @@ -1458,6 +1512,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_
>> ? ? ? ?*/
>> ?retry_find:
>> ? ? ? page = find_lock_page(mapping, vmf->pgoff);
>> +
>> ? ? ? /*
>> ? ? ? ?* For sequential accesses, we use the generic readahead logic.
>> ? ? ? ?*/
>> @@ -1465,7 +1520,13 @@ retry_find:
>> ? ? ? ? ? ? ? if (!page) {
>> ? ? ? ? ? ? ? ? ? ? ? page_cache_sync_readahead(mapping, ra, file,
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?vmf->pgoff, 1);
>> - ? ? ? ? ? ? ? ? ? ? page = find_lock_page(mapping, vmf->pgoff);
>> + ? ? ? ? ? ? ? ? ? ? retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? vma, &page, retry_flag);
>> + ? ? ? ? ? ? ? ? ? ? if (retry_ret == VM_FAULT_RETRY) {
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? /* counteract the followed retry hit */
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? ra->mmap_miss++;
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? return retry_ret;
>> + ? ? ? ? ? ? ? ? ? ? }
>> ? ? ? ? ? ? ? ? ? ? ? if (!page)
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? goto no_cached_page;
>> ? ? ? ? ? ? ? }
>> @@ -1504,7 +1565,14 @@ retry_find:
>> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? start = vmf->pgoff - ra_pages / 2;
>> ? ? ? ? ? ? ? ? ? ? ? do_page_cache_readahead(mapping, file, start, ra_pages);
>> ? ? ? ? ? ? ? }
>> - ? ? ? ? ? ? page = find_lock_page(mapping, vmf->pgoff);
>> +retry_find_retry:
>> + ? ? ? ? ? ? retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
>> + ? ? ? ? ? ? ? ? ? ? ? ? ? ? vma, &page, retry_flag);
>> + ? ? ? ? ? ? if (retry_ret == VM_FAULT_RETRY) {
>> + ? ? ? ? ? ? ? ? ? ? /* counteract the followed retry hit */
>> + ? ? ? ? ? ? ? ? ? ? ra->mmap_miss++;
>> + ? ? ? ? ? ? ? ? ? ? return retry_ret;
>> + ? ? ? ? ? ? }
>> ? ? ? ? ? ? ? if (!page)
>> ? ? ? ? ? ? ? ? ? ? ? goto no_cached_page;
>> ? ? ? }
>> @@ -1548,7 +1616,7 @@ no_cached_page:
>> ? ? ? ?* meantime, we'll just come back here and read it again.
>> ? ? ? ?*/
>> ? ? ? if (error >= 0)
>> - ? ? ? ? ? ? goto retry_find;
>> + ? ? ? ? ? ? goto retry_find_retry;
>>
>> ? ? ? /*
>> ? ? ? ?* An error return from page_cache_read can result if the
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 164951c..5e215c9 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_a
>> ? ? ? vmf.page = NULL;
>>
>> ? ? ? ret = vma->vm_ops->fault(vma, &vmf);
>> +
>> + ? ? /* page may be available, but we have to restart the process
>> + ? ? ?* because mmap_sem was dropped during the ->fault
>> + ? ? ?*/
>
> The preferred comment layout is:
>
>        /*
>         * The page may be available, but we have to restart the process
>         * because mmap_sem was dropped during the ->fault
>         */
>
>> + ? ? if (ret & VM_FAULT_RETRY)
>> + ? ? ? ? ? ? return ret;
>
> Again, I worry that in some places we treat VM_FAULT_RETRY as an
> enumerated type:
>
>        if (foo == VM_FAULT_RETRY)
>
> but in other places we treat it as a bitfield
>
>        if (foo & VM_FAULT_RETRY)
>
> it may not be buggy, but it _looks_ buggy.  And confusing and
> inconsistent and dangerous.
>
> Can it be improved?
>
>> ? ? ? if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
>> ? ? ? ? ? ? ? return ret;
>>
>> @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
>> ?{
>> ? ? ? pgoff_t pgoff = (((address & PAGE_MASK)
>> ? ? ? ? ? ? ? ? ? ? ? - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>> - ? ? unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
>> + ? ? int write = write_access & ~FAULT_FLAG_RETRY;
>> + ? ? unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>>
>> + ? ? flags |= (write_access & FAULT_FLAG_RETRY);
>
> gee, I'm lost.
>
> Can we please redo this as:
>
>
>        int write;
>        unsigned int flags;
>
>        /*
>         * Big fat comment explaining the next three lines goes here
>         */
>        write = write_access & ~FAULT_FLAG_RETRY;
>        unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>        flags |= (write_access & FAULT_FLAG_RETRY);
>
> ?
>
>> ? ? ? pte_unmap(page_table);
>> ? ? ? return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
>> ?}
>> @@ -2726,26 +2735,32 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_ar
>> ? ? ? pud_t *pud;
>> ? ? ? pmd_t *pmd;
>> ? ? ? pte_t *pte;
>> + ? ? int ret;
>>
>> ? ? ? __set_current_state(TASK_RUNNING);
>>
>> - ? ? count_vm_event(PGFAULT);
>> -
>> - ? ? if (unlikely(is_vm_hugetlb_page(vma)))
>> - ? ? ? ? ? ? return hugetlb_fault(mm, vma, address, write_access);
>> + ? ? if (unlikely(is_vm_hugetlb_page(vma))) {
>> + ? ? ? ? ? ? ret = hugetlb_fault(mm, vma, address, write_access);
>> + ? ? ? ? ? ? goto out;
>> + ? ? }
>>
>> + ? ? ret = VM_FAULT_OOM;
>> ? ? ? pgd = pgd_offset(mm, address);
>> ? ? ? pud = pud_alloc(mm, pgd, address);
>> ? ? ? if (!pud)
>> - ? ? ? ? ? ? return VM_FAULT_OOM;
>> + ? ? ? ? ? ? goto out;
>> ? ? ? pmd = pmd_alloc(mm, pud, address);
>> ? ? ? if (!pmd)
>> - ? ? ? ? ? ? return VM_FAULT_OOM;
>> + ? ? ? ? ? ? goto out;
>> ? ? ? pte = pte_alloc_map(mm, pmd, address);
>> ? ? ? if (!pte)
>> - ? ? ? ? ? ? return VM_FAULT_OOM;
>> + ? ? ? ? ? ? goto out;
>>
>> - ? ? return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> + ? ? ret = handle_pte_fault(mm, vma, address, pte, pmd, write_access);
>> +out:
>> + ? ? if (!(ret & VM_FAULT_RETRY))
>> + ? ? ? ? ? ? count_vm_event(PGFAULT);
>> + ? ? return ret;
>> ?}
>
>

2009-04-10 06:49:21

by Fengguang Wu

Subject: Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

On Fri, Apr 10, 2009 at 02:32:01PM +0800, Ying Han wrote:
> 2009/4/9 Andrew Morton <[email protected]>:
> >
> >> Subject: [PATCH][1/2]page_fault retry with NOPAGE_RETRY
> >
> > Please give each patch in the series a unique and meaningful title.
> >
> > On Wed, 8 Apr 2009 13:02:35 -0700 Ying Han <[email protected]> wrote:
> >
> >> support for FAULT_FLAG_RETRY with no user change:
> >
> > yup, we'd prefer a complete changelog here please.
> >
> >> Signed-off-by: Ying Han <[email protected]>
> >> Mike Waychison <[email protected]>
> >
> > This form:
> >
> > Signed-off-by: Ying Han <[email protected]>
> > Signed-off-by: Mike Waychison <[email protected]>
>
> Thanks Andrew, and I need to add Fengguang to Signed-off-by.

Thank you.

> >
> > is conventional.
> >
> >> index 4a853ef..29c2c39 100644
> >> --- a/include/linux/fs.h
> >> +++ b/include/linux/fs.h
> >> @@ -793,7 +793,7 @@ struct file_ra_state {
> >> there are only # of pages ahead */
> >>
> >> unsigned int ra_pages; /* Maximum readahead window */
> >> - int mmap_miss; /* Cache miss stat for mmap accesses */
> >> + unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
> >
> > This change makes sense, but we're not told the reasons for making it?
> > Did it fix a bug, or is it an unrelated fixlet, or...?
>
> Fengguang: could you help comment on this part? I will make the
> changes elsewhere as Andrew pointed out. Thanks

Ah this may deserve a standalone patch:
---
readahead: make mmap_miss an unsigned int

This makes the performance impact of a possible mmap_miss wraparound
temporary and tolerable: i.e. MMAP_LOTSAMISS=100 extra readarounds.

Otherwise, if mmap_miss ever wraps around to a negative value, it takes
INT_MAX cache misses to bring it back to a normal state. During that
time mmap readaround will be _enabled_ for whatever wild random
workload happens to be running. That's an almost permanent performance
impact.

Signed-off-by: Wu Fengguang <[email protected]>
---
include/linux/fs.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- mm.orig/include/linux/fs.h
+++ mm/include/linux/fs.h
@@ -824,7 +824,7 @@ struct file_ra_state {
there are only # of pages ahead */

unsigned int ra_pages; /* Maximum readahead window */
- int mmap_miss; /* Cache miss stat for mmap accesses */
+ unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
loff_t prev_pos; /* Cache last read() position */
};
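
For context, mmap_miss feeds the readaround cutoff in filemap_fault(),
roughly like this in kernels of that era (MMAP_LOTSAMISS is 100; this is
a paraphrase, not an exact quote of mm/filemap.c):

	/* pagecache miss on a mapped file */
	ra->mmap_miss++;

	/*
	 * Do we miss much more than we hit in this file?  If so, stop
	 * bothering with readaround; it will only hurt.
	 */
	if (ra->mmap_miss > MMAP_LOTSAMISS)
		goto no_cached_page;

With a signed counter that has wrapped past INT_MAX to a negative value,
that comparison stays false for roughly another INT_MAX misses, so
readaround keeps firing on a workload that is demonstrably random.  With
an unsigned counter a wraparound lands back at 0, and the cutoff trips
again after only about MMAP_LOTSAMISS further misses.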

2009-04-10 07:36:02

by Fengguang Wu

Subject: Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

On Fri, Apr 10, 2009 at 02:02:05PM +0800, Andrew Morton wrote:
[snip]
> > if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> > return ret;
> >
> > @@ -2611,8 +2618,10 @@ static int do_linear_fault(struct mm_struct *mm, struct
> > {
> > pgoff_t pgoff = (((address & PAGE_MASK)
> > - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> > - unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
> > + int write = write_access & ~FAULT_FLAG_RETRY;
> > + unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
> >
> > + flags |= (write_access & FAULT_FLAG_RETRY);
>
> gee, I'm lost.

So am I.

> Can we please redo this as:
>
>
> int write;
> unsigned int flags;
>
> /*
> * Big fat comment explaining the next three lines goes here
> */

Basically it's doing a
(is_write_access | FAULT_FLAG_RETRY) =>
(FAULT_FLAG_WRITE | FAULT_FLAG_RETRY)
by extracting the bool part:
> write = write_access & ~FAULT_FLAG_RETRY;
convert bool to a bit flag:
> unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
and restore the FAULT_FLAG_RETRY:
> flags |= (write_access & FAULT_FLAG_RETRY);

Thanks,
Fengguang
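
In concrete numbers, with FAULT_FLAG_WRITE = 0x01 and FAULT_FLAG_RETRY =
0x04 as defined in the patch, the three steps work out as follows
(worked example only):

	/* write fault, retry allowed: write_access = 1 | 0x04 = 0x05 */
	write = write_access & ~FAULT_FLAG_RETRY;   /* 0x05 & ~0x04 = 0x01 */
	flags = write ? FAULT_FLAG_WRITE : 0;       /* 0x01                */
	flags |= write_access & FAULT_FLAG_RETRY;   /* 0x01 | 0x04 = 0x05  */

	/* read fault, retry allowed: write_access = 0x04 */
	write = 0x04 & ~0x04;                       /* 0, not a write      */
	flags = 0 | 0x04;                           /* FAULT_FLAG_RETRY    */

It only round-trips to the same value in the first case because the old
boolean and FAULT_FLAG_WRITE happen to share bit 0, which is exactly the
sort of double duty objected to below.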

2009-04-10 16:10:18

by Linus Torvalds

Subject: Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY



On Fri, 10 Apr 2009, Wu Fengguang wrote:

> On Fri, Apr 10, 2009 at 02:02:05PM +0800, Andrew Morton wrote:
>
> > Can we please redo this as:
> >
> >
> > int write;
> > unsigned int flags;
> >
> > /*
> > * Big fat comment explaining the next three lines goes here
> > */
>
> Basically it's doing a
> (is_write_access | FAULT_FLAG_RETRY) =>
> (FAULT_FLAG_WRITE | FAULT_FLAG_RETRY)
> by extracting the bool part:
> > write = write_access & ~FAULT_FLAG_RETRY;
> convert bool to a bit flag:
> > unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);

The point is, we shouldn't do that.

Your code is confused, because it uses "write_access" as if it had the old
behaviour (boolean to say "write") _plus_ the new behavior (bitmask to say
"retry"), and that's just wrong.

Just get rid of "write_access" entirely, and switch it over to something
that is a pure bitmask.

Yes, it means a couple of new preliminary patches that switch all callers
of handle_mm_fault() over to using the VM_FLAGS, but that's not a big
deal.

I'm following up this email with two _example_ patches. They are untested,
but they look sane. I'd like the series to _start_ with these, and then
you can pass FAULT_FLAG_WRITE | FAULT_FLAG_RETRY down to
handle_mm_fault() cleanly.

Hmm? Note the _untested_ part on the patches to follow. It was done very
mechanically, and the patches look sane, but .. !!!

Linus
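
The end state being described is that every caller hands handle_mm_fault()
a pure flags word.  Once the second preliminary patch converts the
exported function as well, the arch side would look roughly like this
(sketch of the calling convention, not Linus's actual patches; can_retry
is just an illustrative local):

	unsigned int flags = FAULT_FLAG_WRITE;	/* for a write fault */

	if (can_retry)				/* e.g. on the first attempt only */
		flags |= FAULT_FLAG_RETRY;

	fault = handle_mm_fault(mm, vma, address, flags);
	if (fault & VM_FAULT_RETRY) {
		/* redo the fault, this time without FAULT_FLAG_RETRY */
	}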

2009-04-10 16:13:17

by Linus Torvalds

Subject: [PATCH 1/2] Remove internal use of 'write_access' in mm/memory.c


From: Linus Torvalds <[email protected]>
Date: Fri, 10 Apr 2009 08:43:11 -0700

The fault handling routines really want more fine-grained flags than a
single "was it a write fault" boolean - the callers will want to set
flags like "you can return a retry error" etc.

And that's actually how the VM works internally, but right now the
top-level fault handling functions in mm/memory.c all pass just the
'write_access' boolean around.

This switches them over to pass around the FAULT_FLAG_xyzzy 'flags'
variable instead. The 'write_access' calling convention still exists
for the exported 'handle_mm_fault()' function, but that is next.

Signed-off-by: Linus Torvalds <[email protected]>
---
mm/memory.c | 42 +++++++++++++++++++++---------------------
1 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index cf6873e..9050bae 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2413,7 +2413,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
*/
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
struct page *page;
@@ -2490,9 +2490,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,

inc_mm_counter(mm, anon_rss);
pte = mk_pte(page, vma->vm_page_prot);
- if (write_access && reuse_swap_page(page)) {
+ if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- write_access = 0;
+ flags &= ~FAULT_FLAG_WRITE;
}
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
@@ -2505,7 +2505,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
try_to_free_swap(page);
unlock_page(page);

- if (write_access) {
+ if (flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
@@ -2533,7 +2533,7 @@ out_nomap:
*/
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access)
+ unsigned int flags)
{
struct page *page;
spinlock_t *ptl;
@@ -2698,7 +2698,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
- * Note that if write_access is true, we either now have
+ * Note that if FAULT_FLAG_WRITE is set, we either now have
* an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
@@ -2753,11 +2753,10 @@ out_unlocked:

static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);

pte_unmap(page_table);
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
@@ -2774,12 +2773,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
- int write_access, pte_t orig_pte)
+ unsigned int flags, pte_t orig_pte)
{
- unsigned int flags = FAULT_FLAG_NONLINEAR |
- (write_access ? FAULT_FLAG_WRITE : 0);
pgoff_t pgoff;

+ flags |= FAULT_FLAG_NONLINEAR;
+
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
return 0;

@@ -2810,7 +2809,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, int write_access)
+ pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
@@ -2821,30 +2820,30 @@ static inline int handle_pte_fault(struct mm_struct *mm,
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
- pte, pmd, write_access, entry);
+ pte, pmd, flags, entry);
}
return do_anonymous_page(mm, vma, address,
- pte, pmd, write_access);
+ pte, pmd, flags);
}
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
- pte, pmd, write_access, entry);
+ pte, pmd, flags, entry);
return do_swap_page(mm, vma, address,
- pte, pmd, write_access, entry);
+ pte, pmd, flags, entry);
}

ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
- if (write_access) {
+ if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
+ if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, entry);
} else {
/*
@@ -2853,7 +2852,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
- if (write_access)
+ if (flags & FAULT_FLAG_WRITE)
flush_tlb_page(vma, address);
}
unlock:
@@ -2871,13 +2870,14 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ unsigned int flags = write_access ? FAULT_FLAG_WRITE : 0;

__set_current_state(TASK_RUNNING);

count_vm_event(PGFAULT);

if (unlikely(is_vm_hugetlb_page(vma)))
- return hugetlb_fault(mm, vma, address, write_access);
+ return hugetlb_fault(mm, vma, address, flags);

pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
@@ -2890,7 +2890,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte)
return VM_FAULT_OOM;

- return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}

#ifndef __PAGETABLE_PUD_FOLDED
--
1.6.2.2.471.g6da14.dirty

2009-04-10 16:18:10

by Linus Torvalds

[permalink] [raw]
Subject: [PATCH 2/2] Move FAULT_FLAG_xyz into handle_mm_fault() callers


From: Linus Torvalds <[email protected]>
Date: Fri, 10 Apr 2009 09:01:23 -0700

This allows the callers to now pass down the full set of FAULT_FLAG_xyz
flags to handle_mm_fault(). All callers have been (mechanically)
converted to the new calling convention, there's almost certainly room
for architectures to clean up their code and then add FAULT_FLAG_RETRY
when that support is added.

Signed-off-by: Linus Torvalds <[email protected]>
---

Again - untested. Not compiled. Might well rape your pets and just
otherwise act badly. It's also a very mechanical conversion, ie I
explicitly did

- fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, fsr & (1 << 11));
+ fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, (fsr & (1 << 11)) ? FAULT_FLAG_WRITE : 0);

rather than doing some cleanup while there.

The point is, once we do this, now you really pass FAULT_FLAG_xyz
everywhere, and now it's trivial to add FAULT_FLAG_RETRY without horribly
ugly or hacky code.

For example, before, I think Wu's code would have failed on ARM if
FAULT_FLAG_RETRY just happened to be (1 << 11), because back when we
passed in "zero or non-zero", ARM would literally pass in (1 << 11) or 0
for "write_access".

Now we explicitly pass in a nice FAULT_FLAG_WRITE or 0.
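
To make the failure mode concrete, suppose (purely for illustration; this
is not the real value) FAULT_FLAG_RETRY had been defined as (1 << 11). A
small user-space sketch of the old and new conventions:

	#include <stdio.h>

	#define FAULT_FLAG_WRITE  0x01
	#define FAULT_FLAG_RETRY  (1 << 11)	/* hypothetical value, for the sake of argument */

	/* stand-in for the core fault handler once it treats its argument as a bitmask */
	static void fault_core(unsigned int flags)
	{
		if (flags & FAULT_FLAG_RETRY)
			printf("core thinks a retry was requested\n");
		if (flags & FAULT_FLAG_WRITE)
			printf("core sees a write fault\n");
	}

	int main(void)
	{
		unsigned long fsr = 1UL << 11;	/* a write fault in the ARM fsr encoding */

		/* old convention: pass the zero/non-zero "write_access" straight through */
		fault_core(fsr & (1 << 11));	/* misread as a retry request, write bit lost */

		/* new convention: the caller builds an explicit FAULT_FLAG_xyz mask */
		fault_core((fsr & (1 << 11)) ? FAULT_FLAG_WRITE : 0);
		return 0;
	}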

arch/alpha/mm/fault.c | 2 +-
arch/arm/mm/fault.c | 2 +-
arch/avr32/mm/fault.c | 2 +-
arch/cris/mm/fault.c | 2 +-
arch/frv/mm/fault.c | 2 +-
arch/ia64/mm/fault.c | 2 +-
arch/m32r/mm/fault.c | 2 +-
arch/m68k/mm/fault.c | 2 +-
arch/mips/mm/fault.c | 2 +-
arch/mn10300/mm/fault.c | 2 +-
arch/parisc/mm/fault.c | 2 +-
arch/powerpc/mm/fault.c | 2 +-
arch/powerpc/platforms/cell/spu_fault.c | 2 +-
arch/s390/lib/uaccess_pt.c | 2 +-
arch/s390/mm/fault.c | 2 +-
arch/sh/mm/fault_32.c | 2 +-
arch/sh/mm/tlbflush_64.c | 2 +-
arch/sparc/mm/fault_32.c | 4 ++--
arch/sparc/mm/fault_64.c | 2 +-
arch/um/kernel/trap.c | 2 +-
arch/x86/mm/fault.c | 2 +-
arch/xtensa/mm/fault.c | 2 +-
include/linux/mm.h | 4 ++--
mm/memory.c | 8 ++++----
24 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 4829f96..00a31de 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -146,7 +146,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
/* If for any reason at all we couldn't handle the fault,
make sure we exit gracefully rather than endlessly redo
the fault. */
- fault = handle_mm_fault(mm, vma, address, cause > 0);
+ fault = handle_mm_fault(mm, vma, address, cause > 0 ? FAULT_FLAG_WRITE : 0);
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 0455557..6fdcbb7 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -208,7 +208,7 @@ good_area:
* than endlessly redo the fault.
*/
survive:
- fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, fsr & (1 << 11));
+ fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, (fsr & (1 << 11)) ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index 62d4abb..b61d86d 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -133,7 +133,7 @@ good_area:
* fault.
*/
survive:
- fault = handle_mm_fault(mm, vma, address, writeaccess);
+ fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index c4c76db..f925115 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -163,7 +163,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
* the fault.
*/

- fault = handle_mm_fault(mm, vma, address, writeaccess & 1);
+ fault = handle_mm_fault(mm, vma, address, (writeaccess & 1) ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index 05093d4..30f5d10 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -163,7 +163,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, ear0, write);
+ fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 23088be..19261a9 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -154,7 +154,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
* sure we exit gracefully rather than endlessly redo the
* fault.
*/
- fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) != 0);
+ fault = handle_mm_fault(mm, vma, address, (mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
/*
* We ran out of memory, or some other thing happened
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index 4a71df4..7274b47 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -196,7 +196,7 @@ survive:
*/
addr = (address & PAGE_MASK);
set_thread_fault_code(error_code);
- fault = handle_mm_fault(mm, vma, addr, write);
+ fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index f493f03..d0e35cf 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -155,7 +155,7 @@ good_area:
*/

survive:
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
#ifdef DEBUG
printk("handle_mm_fault returns %d\n",fault);
#endif
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 55767ad..6751ce9 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -102,7 +102,7 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index 33cf250..a62e1e1 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -258,7 +258,7 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index 92c7fa4..bfb6dd6 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -202,7 +202,7 @@ good_area:
* fault.
*/

- fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) != 0);
+ fault = handle_mm_fault(mm, vma, address, (acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
/*
* We hit a shared mapping outside of the file, or some
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 7699394..e2bf1ee 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -299,7 +299,7 @@ good_area:
* the fault.
*/
survive:
- ret = handle_mm_fault(mm, vma, address, is_write);
+ ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);
if (unlikely(ret & VM_FAULT_ERROR)) {
if (ret & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/powerpc/platforms/cell/spu_fault.c b/arch/powerpc/platforms/cell/spu_fault.c
index 95d8dad..d06ba87 100644
--- a/arch/powerpc/platforms/cell/spu_fault.c
+++ b/arch/powerpc/platforms/cell/spu_fault.c
@@ -70,7 +70,7 @@ int spu_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
}

ret = 0;
- *flt = handle_mm_fault(mm, vma, ea, is_write);
+ *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0);
if (unlikely(*flt & VM_FAULT_ERROR)) {
if (*flt & VM_FAULT_OOM) {
ret = -ENOMEM;
diff --git a/arch/s390/lib/uaccess_pt.c b/arch/s390/lib/uaccess_pt.c
index b0b84c3..cb5d59e 100644
--- a/arch/s390/lib/uaccess_pt.c
+++ b/arch/s390/lib/uaccess_pt.c
@@ -66,7 +66,7 @@ static int __handle_fault(struct mm_struct *mm, unsigned long address,
}

survive:
- fault = handle_mm_fault(mm, vma, address, write_access);
+ fault = handle_mm_fault(mm, vma, address, write_access ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 833e836..31456fa 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -351,7 +351,7 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM) {
up_read(&mm->mmap_sem);
diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c
index 31a33eb..09ef52a 100644
--- a/arch/sh/mm/fault_32.c
+++ b/arch/sh/mm/fault_32.c
@@ -133,7 +133,7 @@ good_area:
* the fault.
*/
survive:
- fault = handle_mm_fault(mm, vma, address, writeaccess);
+ fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c
index 7876997..fcbb6e1 100644
--- a/arch/sh/mm/tlbflush_64.c
+++ b/arch/sh/mm/tlbflush_64.c
@@ -187,7 +187,7 @@ good_area:
* the fault.
*/
survive:
- fault = handle_mm_fault(mm, vma, address, writeaccess);
+ fault = handle_mm_fault(mm, vma, address, writeaccess ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 12e447f..a5e30c6 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -241,7 +241,7 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
@@ -484,7 +484,7 @@ good_area:
if(!(vma->vm_flags & (VM_READ | VM_EXEC)))
goto bad_area;
}
- switch (handle_mm_fault(mm, vma, address, write)) {
+ switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) {
case VM_FAULT_SIGBUS:
case VM_FAULT_OOM:
goto do_sigbus;
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 4ab8993..e5620b2 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -398,7 +398,7 @@ good_area:
goto bad_area;
}

- fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE));
+ fault = handle_mm_fault(mm, vma, address, (fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 7384d8a..637c650 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -65,7 +65,7 @@ good_area:
do {
int fault;

- fault = handle_mm_fault(mm, vma, address, is_write);
+ fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM) {
goto out_of_memory;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b727..65a07ba 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1130,7 +1130,7 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);

if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index bdd860d..bc07333 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -106,7 +106,7 @@ good_area:
* the fault.
*/
survive:
- fault = handle_mm_fault(mm, vma, address, is_write);
+ fault = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bff1f0d..3f207d1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -810,11 +810,11 @@ extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);

#ifdef CONFIG_MMU
extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access);
+ unsigned long address, unsigned int flags);
#else
static inline int handle_mm_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
- int write_access)
+ unsigned int flags)
{
/* should never happen if there's no MMU */
BUG();
diff --git a/mm/memory.c b/mm/memory.c
index 9050bae..383dc0b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1310,8 +1310,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched();
while (!(page = follow_page(vma, start, foll_flags))) {
int ret;
- ret = handle_mm_fault(mm, vma, start,
- foll_flags & FOLL_WRITE);
+
+ /* FOLL_WRITE matches FAULT_FLAG_WRITE! */
+ ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM;
@@ -2864,13 +2865,12 @@ unlock:
* By the time we get here, we already hold the mm semaphore
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, int write_access)
+ unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- unsigned int flags = write_access ? FAULT_FLAG_WRITE : 0;

__set_current_state(TASK_RUNNING);

--
1.6.2.2.471.g6da14.dirty

2009-04-10 17:57:31

by Ying Han

[permalink] [raw]
Subject: Re: [PATCH][1/2]page_fault retry with NOPAGE_RETRY

2009/4/10 Linus Torvalds <[email protected]>:
>
>
> On Fri, 10 Apr 2009, Wu Fengguang wrote:
>
>> On Fri, Apr 10, 2009 at 02:02:05PM +0800, Andrew Morton wrote:
>>
>> > Can we please redo this as:
>> >
>> >
>> > int write;
>> > unsigned int flags;
>> >
>> > /*
>> > * Big fat comment explaining the next three lines goes here
>> > */
>>
>> Basically it's doing a
>> (is_write_access | FAULT_FLAG_RETRY) =>
>> (FAULT_FLAG_WRITE | FAULT_FLAG_RETRY)
>> by extracting the bool part:
>> > write = write_access & ~FAULT_FLAG_RETRY;
>> convert bool to a bit flag:
>> > unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);

Fengguang: thank you for your comments. I agree this is confusing...
>
> The point is, we shouldn't do that.
>
> Your code is confused, because it uses "write_access" as if it had the old
> behaviour (boolean to say "write") _plus_ the new behavior (bitmask to say
> "retry"), and that's just wrong.
>
> Just get rid of "write_access" entirely, and switch it over to something
> that is a pure bitmask.
>
> Yes, it means a couple of new preliminary patches that switch all callers
> of handle_mm_fault() over to using the VM_FLAGS, but that's not a big
> deal.
>
> I'm following up this email with two _example_ patches. They are untested,
> but they look sane. I'd like the series to _start_ with these, and then
> you can pass FAULT_FLAG_WRITE | FAULT_FLAG_RETRY down to
> handle_mm_fault() cleanly.
>
> Hmm? Note the _untested_ part on the patches to follow. It was done very
> mechanically, and the patches look sane, but .. !!!

Thanks Linus for your comments. I will take Peter Zijlstra's patches
(the _untested_ part), which basically replace write_access with a flag
as you mentioned, and start from there.

My next step is to clean up the patch with the comments in the thread so
far and post the new version.
If there is anything else I missed, please let me know. Thanks.

--Ying
>
> Linus
>

2009-04-10 19:15:58

by Ying Han

[permalink] [raw]
Subject: Re: [PATCH 2/2] Move FAULT_FLAG_xyz into handle_mm_fault() callers

2009/4/10 Linus Torvalds <[email protected]>:
>
> From: Linus Torvalds <[email protected]>
> Date: Fri, 10 Apr 2009 09:01:23 -0700
>
> This allows the callers to now pass down the full set of FAULT_FLAG_xyz
> flags to handle_mm_fault(). All callers have been (mechanically)
> converted to the new calling convention, there's almost certainly room
> for architectures to clean up their code and then add FAULT_FLAG_RETRY
> when that support is added.
>
> Signed-off-by: Linus Torvalds <[email protected]>
> ---
>
> Again - untested. Not compiled. Might well rape your pets and just
> otherwise act badly. It's also a very mechanical conversion, ie I
> explicitly did
>
> - fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, fsr & (1 << 11));
> + fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, (fsr & (1 << 11)) ? FAULT_FLAG_WRITE : 0);
>
> rather than doing some cleanup while there.
>
> The point is, once we do this, now you really pass FAULT_FLAG_xyz
> everywhere, and now it's trivial to add FAULT_FLAG_RETRY without horribly
> ugly or hacky code.
>
> For example, before, I think Wu's code would have failed on ARM if
> FAULT_FLAG_RETRY just happened to be (1 << 11), because back when we
> passed in "zero or non-zero", ARM would literally pass in (1 << 11) or 0
> for "write_access".
>
> Now we explicitly pass in a nice FAULT_FLAG_WRITE or 0.
>
> [ diffstat and patch snipped; quoted in full in the message above ]
>

How about something like this for x86? If it looks sane, I will apply it
to the other arches.

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f4..f3b6ee4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -584,11 +584,12 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigne
struct mm_struct *mm;
struct vm_area_struct *vma;
unsigned long address;
- int write, si_code;
+ int si_code;
int fault;
#ifdef CONFIG_X86_64
unsigned long flags;
#endif
+ unsigned int fault_flags |= FAULT_FLAG_RETRY;

/*
* We can fault from pretty much anywhere, with unknown IRQ state.
@@ -722,14 +723,13 @@ again:
*/
good_area:
si_code = SEGV_ACCERR;
- write = 0;
switch (error_code & (PF_PROT|PF_WRITE)) {
default: /* 3: write, present */
/* fall through */
case PF_WRITE: /* write, not present */
if (!(vma->vm_flags & VM_WRITE))
goto bad_area;
- write++;
+ fault_flags |= FAULT_FLAG_WRITE;
break;
case PF_PROT: /* read, present */
goto bad_area;
@@ -746,7 +746,7 @@ survive:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, fault_flags);
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;

--Ying

2009-04-10 19:31:08

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 2/2] Move FAULT_FLAG_xyz into handle_mm_fault() callers



On Fri, 10 Apr 2009, Ying Han wrote:
>
> How about something like this for x86? If it looks sane, i will apply
> to other arches.

Eventually yes, but only _after_ doing the "mindless patch".

I really want the patches that change calling conventions to "obviously"
do nothing else (sure, they can still have bugs, but it minimizes the
risk). Then, _after_ the calling convention has changed, you can do a
separate "clean up" patch.

> + unsigned int fault_flags |= FAULT_FLAG_RETRY;

I assume you meant "fault_flags = FAULT_FLAG_RETRY;", ie without the "|=".

But yes, other than that, this is the kind of patch that makes sense -
having the callers eventually be converted to not use that "write" kind of
boolean, but use the FAULT_FLAG_WRITE flags themselves directly, and then
eventually have no "conversion" between the boolean and the fault_flag
models at all.
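
With that fix, the relevant x86 lines would presumably end up as something
like this (sketch only, untested, following Ying's hunk above):

	unsigned int fault_flags = FAULT_FLAG_RETRY;	/* "=", not "|=" */

	/* later, in the PF_WRITE case of the switch: */
	fault_flags |= FAULT_FLAG_WRITE;

	/* and finally: */
	fault = handle_mm_fault(mm, vma, address, fault_flags);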

Linus

2009-04-14 07:09:21

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH 2/2] Move FAULT_FLAG_xyz into handle_mm_fault() callers

On Fri, Apr 10, 2009 at 09:09:53AM -0700, Linus Torvalds wrote:
>
> From: Linus Torvalds <[email protected]>
> Date: Fri, 10 Apr 2009 09:01:23 -0700
>
> This allows the callers to now pass down the full set of FAULT_FLAG_xyz
> flags to handle_mm_fault(). All callers have been (mechanically)
> converted to the new calling convention, there's almost certainly room
> for architectures to clean up their code and then add FAULT_FLAG_RETRY
> when that support is added.

I like these patches, no objections.

BTW, I had even been toying with allocating the struct vm_fault structure
further up, and filling in various bits as we call down. I haven't put
much time into that, but this patch goes one step toward it... though
arguably this patch is most useful because it allows more "flags" to be
passed down. Probably not much more flexibility can be gained from
passing down the rest of the vm_fault structure (but I might still try
it again in the hope of a readability improvement).
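
Roughly, the idea would be to build the vm_fault once near the top of the
fault path and hand it down, along these lines (purely a sketch; the
callee taking a prebuilt vm_fault is hypothetical, only the field names
match struct vm_fault as it stands today):

	struct vm_fault vmf = {
		.virtual_address = (void __user *)(address & PAGE_MASK),
		.pgoff		 = pgoff,
		.flags		 = flags,
		.page		 = NULL,
	};

	ret = __do_fault_vmf(mm, vma, pmd, &vmf);	/* hypothetical helper */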

>
> Signed-off-by: Linus Torvalds <[email protected]>
> ---
>