2008-12-05 19:40:46

by Ying Han

Subject: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

changelog[v2]:
- reduce the runtime overhead by extending the 'write' flag of
handle_mm_fault() to indicate the retry hint.
- add another two branches in filemap_fault with retry logic.
- replace find_lock_page with find_lock_page_retry to make the code
cleaner.

todo:
- there is a potential starvation hole with the retry. By the time the
retry returns, the pages might be released. We can address this by holding
a page reference as well as remembering what the page "was" (in case the
file was truncated). Any suggestions here are welcome.

I also made patches for all other arches. I am posting x86_64 here first and
I will post the others once everyone feels comfortable with this patch.

Edwin, please test this patch with your testcase and check whether you get
any performance improvement of mmap over read. I added retry logic in two
more places in filemap_fault which you might have hit in your previous
experiment.


page fault retry with NOPAGE_RETRY
Allow major faults to drop the mmap_sem read lock while waiting for a
synchronous disk read. This allows another thread which wishes to grab
down_write(mmap_sem) to proceed while the current thread is waiting for
the disk IO.

The patch extends the 'write' flag of handle_mm_fault() with
FAULT_FLAG_RETRY to indicate that the caller can tolerate a retry in the
filemap_fault call path.

This patch helps a lot in cases where a writer is waiting behind all the
readers, as it can now execute much faster.
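
To make the intended control flow concrete, here is a rough sketch of the
arch side of the protocol (a sketch only, not the actual diff below; the
accounting and signal paths are reduced to plain returns):

	/* Sketch only: the fault is first attempted with FAULT_FLAG_RETRY
	 * set, so a major fault may drop mmap_sem while waiting for the
	 * page.  Because the lock was dropped, the vma must be looked up
	 * again, and the second attempt runs fully synchronously, which
	 * also guarantees the loop terminates. */
	static int fault_loop(struct mm_struct *mm, unsigned long address,
			      int write)
	{
		struct vm_area_struct *vma;
		unsigned int retry_flag = FAULT_FLAG_RETRY;
		int fault;
	retry:
		vma = find_vma(mm, address);
		if (!vma)
			return VM_FAULT_SIGBUS;	/* real code: bad_area */
		fault = handle_mm_fault(mm, vma, address, write | retry_flag);
		if (fault & VM_FAULT_RETRY) {
			BUG_ON(!retry_flag);	/* RETRY without the hint */
			retry_flag = 0;		/* second pass: synchronous */
			goto retry;
		}
		return fault;
	}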


Signed-off-by: Mike Waychison <[email protected]>
Signed-off-by: Ying Han <[email protected]>


diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 31e8730..5cf5eff 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -591,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
#ifdef CONFIG_X86_64
unsigned long flags;
#endif
+ unsigned int retry_flag = FAULT_FLAG_RETRY;

tsk = current;
mm = tsk->mm;
@@ -689,6 +690,7 @@ again:
down_read(&mm->mmap_sem);
}

+retry:
vma = find_vma(mm, address);
if (!vma)
goto bad_area;
@@ -715,6 +717,7 @@ again:
good_area:
si_code = SEGV_ACCERR;
write = 0;
+ write |= retry_flag;
switch (error_code & (PF_PROT|PF_WRITE)) {
default: /* 3: write, present */
/* fall through */
@@ -743,6 +746,15 @@ good_area:
goto do_sigbus;
BUG();
}
+
+ if (fault & VM_FAULT_RETRY) {
+ if (write & FAULT_FLAG_RETRY) {
+ retry_flag &= ~FAULT_FLAG_RETRY;
+ goto retry;
+ }
+ BUG();
+ }
+
if (fault & VM_FAULT_MAJOR)
tsk->maj_flt++;
else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ffee2f7..9cc65a5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -144,7 +144,7 @@ extern pgprot_t protection_map[16];

#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
-
+#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */

/*
* vm_fault is filled by the pagefault handler and passed to the vma's
@@ -694,6 +694,7 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR 0x0004
#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
+#define VM_FAULT_RETRY 0x0010

#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f89..aab4a08 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -714,6 +714,56 @@ repeat:
EXPORT_SYMBOL(find_lock_page);

/**
+ * find_lock_page_retry - locate, pin and lock a pagecache page, if retry
+ * flag is on, and page is already locked by someone else, return a hint of
+ * retry.
+ * @mapping: the address_space to search
+ * @offset: the page index
+ * @vma: vma in which the fault was taken
+ * @page: zero if page not present, otherwise point to the page in
+ * pagecache.
+ * @retry: 1 indicate caller tolerate a retry.
+ *
+ * Return *page==NULL if page is not in pagecache. Otherwise return *page
+ * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
+ * hint to caller for retry, or ret=0 which means page is succefully
+ * locked.
+ */
+unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
+ struct vm_area_struct *vma, struct page **page,
+ int retry)
+{
+ unsigned int ret = 0;
+
+repeat:
+ *page = find_get_page(mapping, offset);
+ if (*page) {
+ if (!retry)
+ lock_page(*page);
+ else {
+ if (!trylock_page(*page)) {
+ struct mm_struct *mm = vma->vm_mm;
+
+ up_read(&mm->mmap_sem);
+ wait_on_page_locked(*page);
+ down_read(&mm->mmap_sem);
+
+ page_cache_release(*page);
+ return VM_FAULT_RETRY;
+ }
+ }
+ if (unlikely((*page)->mapping != mapping)) {
+ unlock_page(*page);
+ page_cache_release(*page);
+ goto repeat;
+ }
+ VM_BUG_ON((*page)->index != offset);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(find_lock_page_retry);
+
+/**
* find_or_create_page - locate or add a pagecache page
* @mapping: the page's address_space
* @index: the page's index into the mapping
@@ -1444,6 +1494,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
pgoff_t size;
int did_readaround = 0;
int ret = 0;
+ int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
+ int retry_ret;

size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (vmf->pgoff >= size)
@@ -1458,6 +1510,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
*/
retry_find:
page = find_lock_page(mapping, vmf->pgoff);
+
+retry_find_nopage:
/*
* For sequential accesses, we use the generic readahead logic.
*/
@@ -1465,9 +1519,12 @@ retry_find:
if (!page) {
page_cache_sync_readahead(mapping, ra, file,
vmf->pgoff, 1);
- page = find_lock_page(mapping, vmf->pgoff);
+ retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+ vma, &page, retry_flag);
if (!page)
goto no_cached_page;
+ if (retry_ret == VM_FAULT_RETRY)
+ return retry_ret;
}
if (PageReadahead(page)) {
page_cache_async_readahead(mapping, ra, file, page,
@@ -1504,14 +1561,18 @@ retry_find:
start = vmf->pgoff - ra_pages / 2;
do_page_cache_readahead(mapping, file, start, ra_pages);
}
- page = find_lock_page(mapping, vmf->pgoff);
+ retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+ vma, &page, retry_flag);
if (!page)
goto no_cached_page;
+ if (retry_ret == VM_FAULT_RETRY)
+ return retry_ret;
}

if (!did_readaround)
ra->mmap_miss--;

+retry_page_update:
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
@@ -1547,8 +1608,23 @@ no_cached_page:
* In the unlikely event that someone removed it in the
* meantime, we'll just come back here and read it again.
*/
- if (error >= 0)
- goto retry_find;
+ if (error >= 0) {
+ /*
+ * If caller cannot tolerate a retry in the ->fault path
+ * go back to check the page again.
+ */
+ if (!retry_flag)
+ goto retry_find;
+
+ retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
+ vma, &page, retry_flag);
+ if (!page)
+ goto retry_find_nopage;
+ else if (retry_ret == VM_FAULT_RETRY)
+ return retry_ret;
+ else
+ goto retry_page_update;
+ }

/*
* An error return from page_cache_read can result if the
diff --git a/mm/memory.c b/mm/memory.c
index 164951c..1ff37f7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vmf.page = NULL;

ret = vma->vm_ops->fault(vma, &vmf);
+
+ /* page may be available, but we have to restart the process
+ * because mmap_sem was dropped during the ->fault
+ */
+ if (ret == VM_FAULT_RETRY)
+ return ret;
+
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;

@@ -2613,6 +2620,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);

+ flags |= (write_access & FAULT_FLAG_RETRY);
pte_unmap(page_table);
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
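
For reviewers skimming the diff, the behavior of the new helper boils down
to the following sketch (the pagecache re-validation loop and index check
are omitted here; see the real find_lock_page_retry() above):

	unsigned sketch_find_lock_page_retry(struct address_space *mapping,
					     pgoff_t offset,
					     struct vm_area_struct *vma,
					     struct page **ppage, int retry)
	{
		struct page *page = find_get_page(mapping, offset);

		if (page && retry && !trylock_page(page)) {
			/* Page is locked for IO: sleep *without* mmap_sem
			 * held, so a waiting writer can get in, then ask
			 * the caller to redo the whole fault. */
			up_read(&vma->vm_mm->mmap_sem);
			wait_on_page_locked(page);
			down_read(&vma->vm_mm->mmap_sem);
			page_cache_release(page);
			return VM_FAULT_RETRY;
		}
		if (page && !retry)
			lock_page(page);	/* old synchronous behavior */
		*ppage = page;			/* NULL if not in pagecache */
		return 0;
	}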


2008-12-06 09:52:25

by Török Edwin

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

On 2008-12-05 21:40, Ying Han wrote:
> changelog[v2]:
> - reduce the runtime overhead by extending the 'write' flag of
> handle_mm_fault() to indicate the retry hint.
> - add another two branches in filemap_fault with retry logic.
> - replace find_lock_page with find_lock_page_retry to make the code
> cleaner.
>
> todo:
> - there is a potential starvation hole with the retry. By the time the
> retry returns, the pages might be released. We can address this by holding
> a page reference as well as remembering what the page "was" (in case the
> file was truncated). Any suggestions here are welcome.
>
> I also made patches for all other arches. I am posting x86_64 here first and
> I will post the others once everyone feels comfortable with this patch.
>
> Edwin, please test this patch with your testcase and check whether you get
> any performance improvement of mmap over read. I added retry logic in two
> more places in filemap_fault which you might have hit in your previous
> experiment.
>

I get much better results with this patch than with v1, thanks!

mmap now scales almost as well as read does (there is a small ~5%
overhead), which is a significant improvement over not scaling at all!

Here are the results when running my testcase:

Number of threads ->        1                    2                    4                    8                   16
Kernel version         read  mmap mixed     read  mmap mixed     read  mmap mixed     read  mmap mixed     read  mmap mixed
2.6.28-rc7-tip        27.55 26.18 27.06    16.18 16.97 16.10    11.06 11.64 11.41     9.38  9.97  9.31     9.37  9.82  9.3


Here is the /proc/lock_stat output when running my testcase; contention
is lower (34911+10462 vs 58590+7231), and waittime-total is better
(57 601 464 vs 234 170 024):

lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class name            con-bounces    contentions   waittime-min   waittime-max   waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max   holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
&mm->mmap_sem-W:             5843          10462           2.89      138824.72      14217159.52          18965          84205           1.81        5031.07        725293.65
&mm->mmap_sem-R:            20208          34911           4.87      136797.26      57601464.49          55797        1110394           1.89      164918.52      30551371.71
---------------
	&mm->mmap_sem            5341   [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
	&mm->mmap_sem           28579   [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
	&mm->mmap_sem            5030   [<ffffffff80211161>] sys_mmap+0xf1/0x140
	&mm->mmap_sem            6331   [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
---------------
	&mm->mmap_sem           13558   [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
	&mm->mmap_sem            4694   [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
	&mm->mmap_sem            3681   [<ffffffff80211161>] sys_mmap+0xf1/0x140
	&mm->mmap_sem           23374   [<ffffffff805d1c62>] do_page_fault+0x172/0xab0


On clamd:

Here holdtime-total is better (1 493 154 + 2 395 987 vs 2 087 538 + 2
514 673), and number of contentions on read
(458 052 vs 5851


lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class name            con-bounces    contentions   waittime-min   waittime-max   waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max   holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
&mm->mmap_sem-W:           346769         533541           1.62       99819.40     454843342.63         411259         588486           1.33        6719.62       2395987.75
&mm->mmap_sem-R:           197856         458052           1.59       99800.28     313508721.01         338158         653427           1.71       25421.10       1493154.95
---------------
	&mm->mmap_sem          427857   [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
	&mm->mmap_sem          266464   [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
	&mm->mmap_sem          251689   [<ffffffff802110d6>] sys_mmap+0x66/0x140
	&mm->mmap_sem           15187   [<ffffffff80211161>] sys_mmap+0xf1/0x140
---------------
	&mm->mmap_sem          226908   [<ffffffff802110d6>] sys_mmap+0x66/0x140
	&mm->mmap_sem          483909   [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
	&mm->mmap_sem          229404   [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
	&mm->mmap_sem           13229   [<ffffffff80211161>] sys_mmap+0xf1/0x140

...............................................................................................................................................................................................

&sem->wait_lock:           112617         114394           0.41         111.20        225590.14        1517470        6300681           0.27        4103.77       3814684.55
---------------
	&sem->wait_lock          5634   [<ffffffff8043a608>] __up_write+0x28/0x170
	&sem->wait_lock         13595   [<ffffffff805ce4dc>] __down_read+0x1c/0xbc
	&sem->wait_lock         38882   [<ffffffff8043a4a0>] __down_read_trylock+0x20/0x60
	&sem->wait_lock         30718   [<ffffffff8043a773>] __up_read+0x23/0xc0
---------------
	&sem->wait_lock         21389   [<ffffffff8043a4a0>] __down_read_trylock+0x20/0x60
	&sem->wait_lock         48959   [<ffffffff8043a608>] __up_write+0x28/0x170
	&sem->wait_lock         24330   [<ffffffff8043a773>] __up_read+0x23/0xc0
	&sem->wait_lock          9000   [<ffffffff805ce4dc>] __down_read+0x1c/0xbc


> @@ -694,6 +694,7 @@ static inline int page_mapped(struct page *page)
> #define VM_FAULT_SIGBUS 0x0002
> #define VM_FAULT_MAJOR 0x0004
> #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
> +#define VM_FAULT_RETRY 0x0010
>
> #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page
>

The patch got damaged here and failed to apply; I added the missing */,
and then git-am -3 applied it.

Best regards,
--Edwin

2008-12-06 09:55:55

by Török Edwin

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

On 2008-12-06 11:52, Török Edwin wrote:
> On 2008-12-05 21:40, Ying Han wrote:
>
>> changelog[v2]:
>> - reduce the runtime overhead by extending the 'write' flag of
>> handle_mm_fault() to indicate the retry hint.
>> - add another two branches in filemap_fault with retry logic.
>> - replace find_lock_page with find_lock_page_retry to make the code
>> cleaner.
>>
>> todo:
>> - there is a potential starvation hole with the retry. By the time the
>> retry returns, the pages might be released. We can address this by holding
>> a page reference as well as remembering what the page "was" (in case the
>> file was truncated). Any suggestions here are welcome.
>>
>> I also made patches for all other arches. I am posting x86_64 here first and
>> I will post the others once everyone feels comfortable with this patch.
>>
>> Edwin, please test this patch with your testcase and check whether you get
>> any performance improvement of mmap over read. I added retry logic in two
>> more places in filemap_fault which you might have hit in your previous
>> experiment.
>>
>>
>
> I get much better results with this patch than with v1, thanks!
>
> mmap now scales almost as well as read does (there is a small ~5%
> overhead), which is a significant improvement over not scaling at all!
>
> Here are the results when running my testcase:
>
> Number of threads ->, 1,,, 2,,, 4,,, 8,,, 16
> Kernel version, read, mmap, mixed, read, mmap, mixed, read, mmap, mixed,
> read, mmap, mixed, read, mmap, mixed
> 2.6.28-rc7-tip, 27.55, 26.18, 27.06, 16.18, 16.97, 16.10, 11.06, 11.64,
> 11.41, 9.38, 9.97, 9.31, 9.37, 9.82, 9.3
>
>
> Here are the /proc/lock_stat output when running my testcase, contention
> is lower (34911+10462 vs 58590+7231), and waittime-total is better
> (57 601 464 vs 234 170 024)
>
> lock_stat version 0.3
> -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> class name con-bounces contentions
> waittime-min waittime-max waittime-total acq-bounces
> acquisitions holdtime-min holdtime-max holdtime-total
> ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> &mm->mmap_sem-W: 5843
> 10462 2.89 138824.72 14217159.52
> 18965 84205 1.81 5031.07 725293.65
> &mm->mmap_sem-R: 20208
> 34911 4.87 136797.26 57601464.49 55797
> 1110394 1.89 164918.52 30551371.71
> ---------------
> &mm->mmap_sem 5341
> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
> &mm->mmap_sem 28579
> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
> &mm->mmap_sem 5030
> [<ffffffff80211161>] sys_mmap+0xf1/0x140
> &mm->mmap_sem 6331
> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
> ---------------
> &mm->mmap_sem 13558
> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
> &mm->mmap_sem 4694
> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
> &mm->mmap_sem 3681
> [<ffffffff80211161>] sys_mmap+0xf1/0x140
> &mm->mmap_sem 23374
> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>
>
> On clamd:
>
> Here holdtime-total is better (1 493 154 + 2 395 987 vs 2 087 538 + 2
> 514 673), and number of contentions on read
> (458 052 vs 5851

typo, should have been: 458 052 vs 585 119

2008-12-08 01:44:00

by Ying Han

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

Thanks Török for your experiment and that sounds great!

--Ying

On Sat, Dec 6, 2008 at 1:55 AM, Török Edwin <[email protected]> wrote:
> On 2008-12-06 11:52, Török Edwin wrote:
>> On 2008-12-05 21:40, Ying Han wrote:
>>
>>> changelog[v2]:
>>> - reduce the runtime overhead by extending the 'write' flag of
>>> handle_mm_fault() to indicate the retry hint.
>>> - add another two branches in filemap_fault with retry logic.
>>> - replace find_lock_page with find_lock_page_retry to make the code
>>> cleaner.
>>>
>>> todo:
>>> - there is a potential starvation hole with the retry. By the time the
>>> retry returns, the pages might be released. We can address this by holding
>>> a page reference as well as remembering what the page "was" (in case the
>>> file was truncated). Any suggestions here are welcome.
>>>
>>> I also made patches for all other arches. I am posting x86_64 here first and
>>> I will post the others once everyone feels comfortable with this patch.
>>>
>>> Edwin, please test this patch with your testcase and check whether you get
>>> any performance improvement of mmap over read. I added retry logic in two
>>> more places in filemap_fault which you might have hit in your previous
>>> experiment.
>>>
>>>
>>
>> I get much better results with this patch than with v1, thanks!
>>
>> mmap now scales almost as well as read does (there is a small ~5%
>> overhead), which is a significant improvement over not scaling at all!
>>
>> Here are the results when running my testcase:
>>
>> Number of threads ->, 1,,, 2,,, 4,,, 8,,, 16
>> Kernel version, read, mmap, mixed, read, mmap, mixed, read, mmap, mixed,
>> read, mmap, mixed, read, mmap, mixed
>> 2.6.28-rc7-tip, 27.55, 26.18, 27.06, 16.18, 16.97, 16.10, 11.06, 11.64,
>> 11.41, 9.38, 9.97, 9.31, 9.37, 9.82, 9.3
>>
>>
>> Here are the /proc/lock_stat output when running my testcase, contention
>> is lower (34911+10462 vs 58590+7231), and waittime-total is better
>> (57 601 464 vs 234 170 024)
>>
>> lock_stat version 0.3
>> -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>> class name con-bounces contentions
>> waittime-min waittime-max waittime-total acq-bounces
>> acquisitions holdtime-min holdtime-max holdtime-total
>> ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>> &mm->mmap_sem-W: 5843
>> 10462 2.89 138824.72 14217159.52
>> 18965 84205 1.81 5031.07 725293.65
>> &mm->mmap_sem-R: 20208
>> 34911 4.87 136797.26 57601464.49 55797
>> 1110394 1.89 164918.52 30551371.71
>> ---------------
>> &mm->mmap_sem 5341
>> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>> &mm->mmap_sem 28579
>> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>> &mm->mmap_sem 5030
>> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>> &mm->mmap_sem 6331
>> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>> ---------------
>> &mm->mmap_sem 13558
>> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>> &mm->mmap_sem 4694
>> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>> &mm->mmap_sem 3681
>> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>> &mm->mmap_sem 23374
>> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>>
>>
>> On clamd:
>>
>> Here holdtime-total is better (1 493 154 + 2 395 987 vs 2 087 538 + 2
>> 514 673), and number of contentions on read
>> (458 052 vs 5851
>
> typo, should have been: 458 052 vs 585 119
>
>

2008-12-09 17:58:11

by Ying Han

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

Andrew, does it sound good to include this patch in -mm at this point?

thanks

--Ying

On Sun, Dec 7, 2008 at 5:43 PM, Ying Han <[email protected]> wrote:
> Thanks Török for your experiment and that sounds great!
>
> --Ying
>
> On Sat, Dec 6, 2008 at 1:55 AM, Török Edwin <[email protected]> wrote:
>> On 2008-12-06 11:52, Török Edwin wrote:
>>> On 2008-12-05 21:40, Ying Han wrote:
>>>
>>>> changelog[v2]:
>>>> - reduce the runtime overhead by extending the 'write' flag of
>>>> handle_mm_fault() to indicate the retry hint.
>>>> - add another two branches in filemap_fault with retry logic.
>>>> - replace find_lock_page with find_lock_page_retry to make the code
>>>> cleaner.
>>>>
>>>> todo:
>>>> - there is a potential starvation hole with the retry. By the time the
>>>> retry returns, the pages might be released. We can address this by holding
>>>> a page reference as well as remembering what the page "was" (in case the
>>>> file was truncated). Any suggestions here are welcome.
>>>>
>>>> I also made patches for all other arches. I am posting x86_64 here first and
>>>> I will post the others once everyone feels comfortable with this patch.
>>>>
>>>> Edwin, please test this patch with your testcase and check whether you get
>>>> any performance improvement of mmap over read. I added retry logic in two
>>>> more places in filemap_fault which you might have hit in your previous
>>>> experiment.
>>>>
>>>>
>>>
>>> I get much better results with this patch than with v1, thanks!
>>>
>>> mmap now scales almost as well as read does (there is a small ~5%
>>> overhead), which is a significant improvement over not scaling at all!
>>>
>>> Here are the results when running my testcase:
>>>
>>> Number of threads ->, 1,,, 2,,, 4,,, 8,,, 16
>>> Kernel version, read, mmap, mixed, read, mmap, mixed, read, mmap, mixed,
>>> read, mmap, mixed, read, mmap, mixed
>>> 2.6.28-rc7-tip, 27.55, 26.18, 27.06, 16.18, 16.97, 16.10, 11.06, 11.64,
>>> 11.41, 9.38, 9.97, 9.31, 9.37, 9.82, 9.3
>>>
>>>
>>> Here are the /proc/lock_stat output when running my testcase, contention
>>> is lower (34911+10462 vs 58590+7231), and waittime-total is better
>>> (57 601 464 vs 234 170 024)
>>>
>>> lock_stat version 0.3
>>> -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>>> class name con-bounces contentions
>>> waittime-min waittime-max waittime-total acq-bounces
>>> acquisitions holdtime-min holdtime-max holdtime-total
>>> ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>>> &mm->mmap_sem-W: 5843
>>> 10462 2.89 138824.72 14217159.52
>>> 18965 84205 1.81 5031.07 725293.65
>>> &mm->mmap_sem-R: 20208
>>> 34911 4.87 136797.26 57601464.49 55797
>>> 1110394 1.89 164918.52 30551371.71
>>> ---------------
>>> &mm->mmap_sem 5341
>>> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>>> &mm->mmap_sem 28579
>>> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>>> &mm->mmap_sem 5030
>>> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>>> &mm->mmap_sem 6331
>>> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>>> ---------------
>>> &mm->mmap_sem 13558
>>> [<ffffffff802a675e>] find_lock_page_retry+0xde/0xf0
>>> &mm->mmap_sem 4694
>>> [<ffffffff802bf9d7>] sys_munmap+0x47/0x80
>>> &mm->mmap_sem 3681
>>> [<ffffffff80211161>] sys_mmap+0xf1/0x140
>>> &mm->mmap_sem 23374
>>> [<ffffffff805d1c62>] do_page_fault+0x172/0xab0
>>>
>>>
>>> On clamd:
>>>
>>> Here holdtime-total is better (1 493 154 + 2 395 987 vs 2 087 538 + 2
>>> 514 673), and number of contentions on read
>>> (458 052 vs 5851
>>
>> typo, should have been: 458 052 vs 585 119
>>
>>
>

2008-12-09 19:32:14

by Andrew Morton

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

On Fri, 5 Dec 2008 11:40:19 -0800
Ying Han <[email protected]> wrote:

> changelog[v2]:
> - reduce the runtime overhead by extending the 'write' flag of
> handle_mm_fault() to indicate the retry hint.
> - add another two branches in filemap_fault with retry logic.
> - replace find_lock_page with find_lock_page_retry to make the code
> cleaner.
>
> todo:
> - there is a potential starvation hole with the retry. By the time the
> retry returns, the pages might be released. We can address this by holding
> a page reference as well as remembering what the page "was" (in case the
> file was truncated). Any suggestions here are welcome.
>
> I also made patches for all other arches. I am posting x86_64 here first and
> I will post the others once everyone feels comfortable with this patch.
>
> Edwin, please test this patch with your testcase and check whether you get
> any performance improvement of mmap over read. I added retry logic in two
> more places in filemap_fault which you might have hit in your previous
> experiment.
>
>
> page fault retry with NOPAGE_RETRY
> Allow major faults to drop the mmap_sem read lock while waiting for a
> synchronous disk read. This allows another thread which wishes to grab
> down_write(mmap_sem) to proceed while the current thread is waiting for
> the disk IO.
>
> The patch extends the 'write' flag of handle_mm_fault() with
> FAULT_FLAG_RETRY to indicate that the caller can tolerate a retry in the
> filemap_fault call path.
>
> This patch helps a lot in cases where a writer is waiting behind all the
> readers, as it can now execute much faster.
>
>
> Signed-off-by: Mike Waychison <[email protected]>
> Signed-off-by: Ying Han <[email protected]>
>

It would be useful if the changelog were to describe the performance
benefits of the patch. I mean, the whole point of the patch is to
improve throughput/latency/etc, but we see no evidence here that it
_does_ this?

> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 31e8730..5cf5eff 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -591,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
> #ifdef CONFIG_X86_64
> unsigned long flags;
> #endif
> + unsigned int retry_flag = FAULT_FLAG_RETRY;
>
> tsk = current;
> mm = tsk->mm;
> @@ -689,6 +690,7 @@ again:
> down_read(&mm->mmap_sem);
> }
>
> +retry:
> vma = find_vma(mm, address);
> if (!vma)
> goto bad_area;
> @@ -715,6 +717,7 @@ again:
> good_area:
> si_code = SEGV_ACCERR;
> write = 0;
> + write |= retry_flag;
> switch (error_code & (PF_PROT|PF_WRITE)) {
> default: /* 3: write, present */
> /* fall through */
> @@ -743,6 +746,15 @@ good_area:
> goto do_sigbus;
> BUG();
> }
> +
> + if (fault & VM_FAULT_RETRY) {
> + if (write & FAULT_FLAG_RETRY) {
> + retry_flag &= ~FAULT_FLAG_RETRY;

What are we doing here? We appear to be retrying the fault once, then
we switch to synchronous mode? There is no description of this in the
changelog and there is no comment explaining the reasons for this
design. There is no way in which readers of this code will be able to
understand why it is implemented in this fashion.

And a *lot* of people will want to know why this was done this way.
Starting with about twenty arch maintainers!

Please add a comment which completely describes this code section.
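
Something along these lines would do, purely as an illustration of the
shape such a comment could take (the wording here is a suggestion, not
part of the patch):

	/*
	 * The first attempt is made with FAULT_FLAG_RETRY set, so a major
	 * fault may drop mmap_sem while waiting for the page to come in.
	 * In that case ->fault returns VM_FAULT_RETRY: we must re-lookup
	 * the vma (it may have been removed or changed while the lock was
	 * dropped) and redo the fault, this time synchronously.  Clearing
	 * the flag guarantees that we retry at most once.
	 */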


> + goto retry;
> + }
> + BUG();
> + }
> +
> if (fault & VM_FAULT_MAJOR)
> tsk->maj_flt++;
> else
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index ffee2f7..9cc65a5 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -144,7 +144,7 @@ extern pgprot_t protection_map[16];
>
> #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
> #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
> -
> +#define FAULT_FLAG_RETRY 0x04 /* Retry major fault */
>
> /*
> * vm_fault is filled by the pagefault handler and passed to the vma's
> @@ -694,6 +694,7 @@ static inline int page_mapped(struct page *page)
> #define VM_FAULT_SIGBUS 0x0002
> #define VM_FAULT_MAJOR 0x0004
> #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
> +#define VM_FAULT_RETRY 0x0010
>
> #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page
> #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f3e5f89..aab4a08 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -714,6 +714,56 @@ repeat:
> EXPORT_SYMBOL(find_lock_page);
>
> /**
> + * find_lock_page_retry - locate, pin and lock a pagecache page, if retry
> + * flag is on, and page is already locked by someone else, return a hint of
> + * retry.

kerneldoc does not support linewrapping in this context. It has to be
done on a single line. Please see the appended patch.

> + * @mapping: the address_space to search
> + * @offset: the page index
> + * @vma: vma in which the fault was taken
> + * @page: zero if page not present, otherwise point to the page in
> + * pagecache.

Ditto here.

> + * @retry: 1 indicate caller tolerate a retry.
> + *
> + * Return *page==NULL if page is not in pagecache. Otherwise return *page
> + * points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
> + * hint to caller for retry, or ret=0 which means page is succefully
> + * locked.
> + */
> +unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
> + struct vm_area_struct *vma, struct page **page,
> + int retry)
> +{
> + unsigned int ret = 0;
> +
> +repeat:
> + *page = find_get_page(mapping, offset);
> + if (*page) {
> + if (!retry)
> + lock_page(*page);
> + else {
> + if (!trylock_page(*page)) {
> + struct mm_struct *mm = vma->vm_mm;
> +
> + up_read(&mm->mmap_sem);
> + wait_on_page_locked(*page);
> + down_read(&mm->mmap_sem);
> +
> + page_cache_release(*page);
> + return VM_FAULT_RETRY;
> + }
> + }
> + if (unlikely((*page)->mapping != mapping)) {
> + unlock_page(*page);
> + page_cache_release(*page);
> + goto repeat;
> + }
> + VM_BUG_ON((*page)->index != offset);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL(find_lock_page_retry);

The patch adds no declaration for find_lock_page_retry() in any header
file. See appended patch.

The frequent dereferencing of `page' is a bit ungainly, and adds risk
that gcc will generate poor code (my version of gcc does manage to do
the right thing, however). I do think the code would look better if
we added a local for this. See appended patch.

MM developers expect a variable called `page' to have type `struct page
*'. This function violates that by adding `struct page **page'. See
appended patch.


> +/**
> * find_or_create_page - locate or add a pagecache page
> * @mapping: the page's address_space
> * @index: the page's index into the mapping
> @@ -1444,6 +1494,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> pgoff_t size;
> int did_readaround = 0;
> int ret = 0;
> + int retry_flag = vmf->flags & FAULT_FLAG_RETRY;
> + int retry_ret;
>
> size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
> if (vmf->pgoff >= size)
> @@ -1458,6 +1510,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> */
> retry_find:
> page = find_lock_page(mapping, vmf->pgoff);
> +
> +retry_find_nopage:
> /*
> * For sequential accesses, we use the generic readahead logic.
> */
> @@ -1465,9 +1519,12 @@ retry_find:
> if (!page) {
> page_cache_sync_readahead(mapping, ra, file,
> vmf->pgoff, 1);
> - page = find_lock_page(mapping, vmf->pgoff);
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> if (!page)
> goto no_cached_page;
> + if (retry_ret == VM_FAULT_RETRY)
> + return retry_ret;
> }
> if (PageReadahead(page)) {
> page_cache_async_readahead(mapping, ra, file, page,
> @@ -1504,14 +1561,18 @@ retry_find:
> start = vmf->pgoff - ra_pages / 2;
> do_page_cache_readahead(mapping, file, start, ra_pages);
> }
> - page = find_lock_page(mapping, vmf->pgoff);
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> if (!page)
> goto no_cached_page;
> + if (retry_ret == VM_FAULT_RETRY)
> + return retry_ret;
> }
>
> if (!did_readaround)
> ra->mmap_miss--;
>
> +retry_page_update:
> /*
> * We have a locked page in the page cache, now we need to check
> * that it's up-to-date. If not, it is going to be due to an error.
> @@ -1547,8 +1608,23 @@ no_cached_page:
> * In the unlikely event that someone removed it in the
> * meantime, we'll just come back here and read it again.
> */
> - if (error >= 0)
> - goto retry_find;
> + if (error >= 0) {
> + /*
> + * If caller cannot tolerate a retry in the ->fault path
> + * go back to check the page again.
> + */
> + if (!retry_flag)
> + goto retry_find;
> +
> + retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> + vma, &page, retry_flag);
> + if (!page)
> + goto retry_find_nopage;
> + else if (retry_ret == VM_FAULT_RETRY)
> + return retry_ret;
> + else
> + goto retry_page_update;
> + }
>
> /*
> * An error return from page_cache_read can result if the
> diff --git a/mm/memory.c b/mm/memory.c
> index 164951c..1ff37f7 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2467,6 +2467,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> vmf.page = NULL;
>
> ret = vma->vm_ops->fault(vma, &vmf);
> +
> + /* page may be available, but we have to restart the process
> + * because mmap_sem was dropped during the ->fault
> + */
> + if (ret == VM_FAULT_RETRY)
> + return ret;

Shouldn't this be

if (ret & VM_FAULT_RETRY)

?

It may not make any difference in practice with present ->fault
implementations, but that's through sheer luck.
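
(Illustrative: a ->fault implementation could legitimately return a
combined code such as

	return VM_FAULT_MAJOR | VM_FAULT_RETRY;

which the `==' test would silently miss.)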

> if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
> return ret;
>
> @@ -2613,6 +2620,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
> unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
>
> + flags |= (write_access & FAULT_FLAG_RETRY);
> pte_unmap(page_table);
> return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> }


include/linux/pagemap.h | 3 +++
mm/filemap.c | 38 ++++++++++++++++++++------------------
2 files changed, 23 insertions(+), 18 deletions(-)

diff -puN arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix arch/x86/mm/fault.c
diff -puN include/linux/mm.h~page_fault-retry-with-nopage_retry-fix include/linux/mm.h
diff -puN mm/filemap.c~page_fault-retry-with-nopage_retry-fix mm/filemap.c
--- a/mm/filemap.c~page_fault-retry-with-nopage_retry-fix
+++ a/mm/filemap.c
@@ -714,51 +714,53 @@ repeat:
EXPORT_SYMBOL(find_lock_page);

/**
- * find_lock_page_retry - locate, pin and lock a pagecache page, if retry
- * flag is on, and page is already locked by someone else, return a hint of
- * retry.
+ * find_lock_page_retry - locate, pin and lock a pagecache page
* @mapping: the address_space to search
* @offset: the page index
* @vma: vma in which the fault was taken
- * @page: zero if page not present, otherwise point to the page in
- * pagecache.
+ * @ppage: zero if page not present, otherwise point to the page in pagecache.
* @retry: 1 indicate caller tolerate a retry.
*
- * Return *page==NULL if page is not in pagecache. Otherwise return *page
+ * If retry flag is on, and page is already locked by someone else, return a
+ * hint of retry.
+ *
+ * Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
* points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
- * hint to caller for retry, or ret=0 which means page is succefully
+ * hint to caller for retry, or ret=0 which means page is successfully
* locked.
*/
unsigned find_lock_page_retry(struct address_space *mapping, pgoff_t offset,
- struct vm_area_struct *vma, struct page **page,
+ struct vm_area_struct *vma, struct page **ppage,
int retry)
{
unsigned int ret = 0;
+ struct page *page;

repeat:
- *page = find_get_page(mapping, offset);
- if (*page) {
+ page = find_get_page(mapping, offset);
+ if (page) {
if (!retry)
- lock_page(*page);
+ lock_page(page);
else {
- if (!trylock_page(*page)) {
+ if (!trylock_page(page)) {
struct mm_struct *mm = vma->vm_mm;

up_read(&mm->mmap_sem);
- wait_on_page_locked(*page);
+ wait_on_page_locked(page);
down_read(&mm->mmap_sem);

- page_cache_release(*page);
+ page_cache_release(page);
return VM_FAULT_RETRY;
}
}
- if (unlikely((*page)->mapping != mapping)) {
- unlock_page(*page);
- page_cache_release(*page);
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
goto repeat;
}
- VM_BUG_ON((*page)->index != offset);
+ VM_BUG_ON(page->index != offset);
}
+ *ppage = page;
return ret;
}
EXPORT_SYMBOL(find_lock_page_retry);
diff -puN mm/memory.c~page_fault-retry-with-nopage_retry-fix mm/memory.c
diff -puN include/linux/pagemap.h~page_fault-retry-with-nopage_retry-fix include/linux/pagemap.h
--- a/include/linux/pagemap.h~page_fault-retry-with-nopage_retry-fix
+++ a/include/linux/pagemap.h
@@ -232,6 +232,9 @@ extern struct page * find_get_page(struc
pgoff_t index);
extern struct page * find_lock_page(struct address_space *mapping,
pgoff_t index);
+extern unsigned find_lock_page_retry(struct address_space *mapping,
+ pgoff_t offset, struct vm_area_struct *vma,
+ struct page **ppage, int retry);
extern struct page * find_or_create_page(struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
_

2009-01-26 19:38:31

by Andrew Morton

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

On Fri, 5 Dec 2008 11:40:19 -0800
Ying Han <[email protected]> wrote:

> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -591,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
> #ifdef CONFIG_X86_64
> unsigned long flags;
> #endif
> + unsigned int retry_flag = FAULT_FLAG_RETRY;
>
> tsk = current;
> mm = tsk->mm;
> @@ -689,6 +690,7 @@ again:
> down_read(&mm->mmap_sem);
> }
>
> +retry:
> vma = find_vma(mm, address);
> if (!vma)
> goto bad_area;
> @@ -715,6 +717,7 @@ again:
> good_area:
> si_code = SEGV_ACCERR;
> write = 0;
> + write |= retry_flag;
> switch (error_code & (PF_PROT|PF_WRITE)) {
> default: /* 3: write, present */
> /* fall through */
> @@ -743,6 +746,15 @@ good_area:
> goto do_sigbus;
> BUG();
> }
> +
> + if (fault & VM_FAULT_RETRY) {
> + if (write & FAULT_FLAG_RETRY) {
> + retry_flag &= ~FAULT_FLAG_RETRY;
> + goto retry;
> + }
> + BUG();
> + }
> +
> if (fault & VM_FAULT_MAJOR)
> tsk->maj_flt++;
> else

This code is mixing flags from the FAULT_FLAG_foo domain into local
variable `write'. But that's inappropriate because `write' is a
boolean, and in one of Ingo's trees, `write' gets bits other than bit 0
set, and it all generally ends up a mess.

Can we not do that? I assume that a previous version of this patch
kept those things separated?

Something like this, I think?

diff -puN arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix arch/x86/mm/fault.c
--- a/arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix
+++ a/arch/x86/mm/fault.c
@@ -799,7 +799,7 @@ void __kprobes do_page_fault(struct pt_r
struct vm_area_struct *vma;
int write;
int fault;
- unsigned int retry_flag = FAULT_FLAG_RETRY;
+ int retry_flag = 1;

tsk = current;
mm = tsk->mm;
@@ -951,6 +951,7 @@ good_area:
}

write |= retry_flag;
+
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
@@ -969,8 +970,8 @@ good_area:
* be removed or changed after the retry.
*/
if (fault & VM_FAULT_RETRY) {
- if (write & FAULT_FLAG_RETRY) {
- retry_flag &= ~FAULT_FLAG_RETRY;
+ if (retry_flag) {
+ retry_flag = 0;
goto retry;
}
BUG();


Question: why is this code passing `write==true' into handle_mm_fault()
in the retry case?

2009-01-26 23:53:50

by Andrew Morton

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

On Mon, 26 Jan 2009 15:08:48 -0800
Ying Han <[email protected]> wrote:

> On Mon, Jan 26, 2009 at 11:37 AM, Andrew Morton
> <[email protected]>wrote:
>
> > On Fri, 5 Dec 2008 11:40:19 -0800
> > Ying Han <[email protected]> wrote:
> >
> > > --- a/arch/x86/mm/fault.c
> > > +++ b/arch/x86/mm/fault.c
> > > @@ -591,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
> > > #ifdef CONFIG_X86_64
> > > unsigned long flags;
> > > #endif
> > > + unsigned int retry_flag = FAULT_FLAG_RETRY;
> > >
> > > tsk = current;
> > > mm = tsk->mm;
> > > @@ -689,6 +690,7 @@ again:
> > > down_read(&mm->mmap_sem);
> > > }
> > >
> > > +retry:
> > > vma = find_vma(mm, address);
> > > if (!vma)
> > > goto bad_area;
> > > @@ -715,6 +717,7 @@ again:
> > > good_area:
> > > si_code = SEGV_ACCERR;
> > > write = 0;
> > > + write |= retry_flag;
> > > switch (error_code & (PF_PROT|PF_WRITE)) {
> > > default: /* 3: write, present */
> > > /* fall through */
> > > @@ -743,6 +746,15 @@ good_area:
> > > goto do_sigbus;
> > > BUG();
> > > }
> > > +
> > > + if (fault & VM_FAULT_RETRY) {
> > > + if (write & FAULT_FLAG_RETRY) {
> > > + retry_flag &= ~FAULT_FLAG_RETRY;
> > > + goto retry;
> > > + }
> > > + BUG();
> > > + }
> > > +
> > > if (fault & VM_FAULT_MAJOR)
> > > tsk->maj_flt++;
> > > else
> >
> > This code is mixing flags from the FAULT_FLAG_foo domain into local
> > variable `write'. But that's inappropriate because `write' is a
> > boolean, and in one of Ingo's trees, `write' gets bits other than bit 0
> > set, and it all generally ends up a mess.
> >
> > Can we not do that? I assume that a previous version of this patch
> > kept those things separated?
> >
> > Something like this, I think?
> >
> > diff -puN arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix
> > arch/x86/mm/fault.c
> > --- a/arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix
> > +++ a/arch/x86/mm/fault.c
> > @@ -799,7 +799,7 @@ void __kprobes do_page_fault(struct pt_r
> > struct vm_area_struct *vma;
> > int write;
> > int fault;
> > - unsigned int retry_flag = FAULT_FLAG_RETRY;
> > + int retry_flag = 1;
> >
> > tsk = current;
> > mm = tsk->mm;
> > @@ -951,6 +951,7 @@ good_area:
> > }
> >
> > write |= retry_flag;
> > +
> > /*
> > * If for any reason at all we couldn't handle the fault,
> > * make sure we exit gracefully rather than endlessly redo
> > @@ -969,8 +970,8 @@ good_area:
> > * be removed or changed after the retry.
> > */
> > if (fault & VM_FAULT_RETRY) {
> > - if (write & FAULT_FLAG_RETRY) {
> > - retry_flag &= ~FAULT_FLAG_RETRY;
> > + if (retry_flag) {
> > + retry_flag = 0;
> > goto retry;
> > }
> > BUG();
>
> with this change, 'write' still gets bits other than bit 0
> set in the case of 'write, not present' and Ingo's problem remains, am i
> missing something here?

umm, yes. This?

--- a/arch/x86/mm/fault.c~page_fault-retry-with-nopage_retry-fix-fix
+++ a/arch/x86/mm/fault.c
@@ -950,8 +950,6 @@ good_area:
return;
}

- write |= retry_flag;
-
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
_


(I should just give up here - doing too many things at once)

> >
> >
> >
> > Question: why is this code passing `write==true' into handle_mm_fault()
> > in the retry case?
>
> Here i am using an unused bit of "write" to carry the FAULT_FLAG_RETRY flag
> down to handle_mm_fault(). Meanwhile, "write" still has its read/write bit
> set as it was before. It is true that 'write == true' in the retry path, but
> i did the correct interpretation in
>
> static int do_linear_fault() {
> 	int write = write_access & ~FAULT_FLAG_RETRY;
> 	unsigned int flags = (write ? FAULT_FLAG_WRITE : 0);
>
> 	flags |= (write_access & FAULT_FLAG_RETRY);
> 	pte_unmap(page_table);
> 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
> }


OK, this is horridly confusing. Is `write_access' a boolean, as its
name implies, or is it a bunch of flags?

If we're going to turn it into a bunch of flags then it should be
renamed! And callsites such as do_page_fault() should rename their
local variable `write' to something which accurately conveys the new
usage. And various code comments in mm/memory.c (which don't appear to
exist) should be updated.

I think that a good way to present this is as a preparatory patch:
"convert the fourth argument to handle_mm_fault() from a boolean to a
flags word". That would be a simple do-nothing patch which affects all
architectures and which ideally would break the build at any
unconverted code sites. (Change the argument order?)

What do you think?
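
As a sketch of what such a do-nothing conversion might look like (the
final flag names are whatever we settle on):

	/* before: the fourth argument is a boolean */
	extern int handle_mm_fault(struct mm_struct *mm,
			struct vm_area_struct *vma,
			unsigned long address, int write_access);

	/* after: an explicit flags word */
	extern int handle_mm_fault(struct mm_struct *mm,
			struct vm_area_struct *vma,
			unsigned long address, unsigned int flags);

	/* with each caller converted mechanically, e.g.: */
	fault = handle_mm_fault(mm, vma, address,
				write ? FAULT_FLAG_WRITE : 0);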

2009-01-26 23:57:47

by Ingo Molnar

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY


* Andrew Morton <[email protected]> wrote:

> I think that a good way to present this is as a preparatory patch:
> "convert the fourth argument to handle_mm_fault() from a boolean to a
> flags word". That would be a simple do-nothing patch which affects all
> architectures and which ideally would break the build at any unconverted
> code sites. (Change the argument order?)

why not do what i suggested: refactor do_page_fault() into a platform
specific / kernel-internal fault part and a generic-user-pte function.
That alone would increase readability i suspect.

Then the 'retry' is multiple calls from handle_pte_fault().

Or something like that.
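
E.g., very roughly (a sketch only; fault_permitted() is a made-up
placeholder for the arch's access_error() checks):

	/* arch handler keeps the entry/vmalloc/oops logic, then calls: */
	int generic_user_fault(struct mm_struct *mm, unsigned long address,
			       int write)
	{
		struct vm_area_struct *vma;

		vma = find_vma(mm, address);
		if (!vma || !fault_permitted(vma, address, write))
			return VM_FAULT_SIGBUS;
		/* a retry loop could live here, in one place, instead of
		 * being threaded through twenty arch copies */
		return handle_mm_fault(mm, vma, address, write);
	}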

It looks wrong to me to pass another flag through this hot codepath, just
to express a property that the _highlevel_ code is interested in.

Ingo

2009-01-27 04:34:37

by KOSAKI Motohiro

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

>
> * Andrew Morton <[email protected]> wrote:
>
> > I think that a good way to present this is as a preparatory patch:
> > "convert the fourth argument to handle_mm_fault() from a boolean to a
> > flags word". That would be a simple do-nothing patch which affects all
> > architectures and which ideally would break the build at any unconverted
> > code sites. (Change the argument order?)
>
> why not do what i suggested: refactor do_page_fault() into a platform
> specific / kernel-internal fault part and a generic-user-pte function.
> That alone would increase readability i suspect.
>
> Then the 'retry' is multiple calls from handle_pte_fault().
>
> Or something like that.
>
> It looks wrong to me to pass another flag through this hot codepath, just
> to express a property that the _highlevel_ code is interested in.

I like this idea :)


2009-03-31 22:09:06

by Andrew Morton

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

On Fri, 5 Dec 2008 11:40:19 -0800
Ying Han <[email protected]> wrote:

> changelog[v2]:
> - reduce the runtime overhead by extending the 'write' flag of
> handle_mm_fault() to indicate the retry hint.
> - add another two branches in filemap_fault with retry logic.
> - replace find_lock_page with find_lock_page_retry to make the code
> cleaner.
>
> todo:
> - there is a potential starvation hole with the retry. By the time the
> retry returns, the pages might be released. We can address this by holding
> a page reference as well as remembering what the page "was" (in case the
> file was truncated). Any suggestions here are welcome.
>
> I also made patches for all other arches. I am posting x86_64 here first and
> I will post the others once everyone feels comfortable with this patch.

I'm about to send this into Linus. What happened to the patches for
other architectures?

Please send them over when convenient and I'll work on getting them
trickled out to arch maintainers, thanks.

2009-04-01 00:18:17

by Ying Han

Subject: Re: [RFC v2][PATCH]page_fault retry with NOPAGE_RETRY

Thanks Andrew. I have the patches for all the arches and i just need
to clean them up a little bit. I will send them to you ASAP.

--Ying

On Tue, Mar 31, 2009 at 3:00 PM, Andrew Morton
<[email protected]> wrote:
> On Fri, 5 Dec 2008 11:40:19 -0800
> Ying Han <[email protected]> wrote:
>
>> changelog[v2]:
>> - reduce the runtime overhead by extending the 'write' flag of
>>   handle_mm_fault() to indicate the retry hint.
>> - add another two branches in filemap_fault with retry logic.
>> - replace find_lock_page with find_lock_page_retry to make the code
>>   cleaner.
>>
>> todo:
>> - there is a potential starvation hole with the retry. By the time the
>>   retry returns, the pages might be released. We can address this by holding
>>   a page reference as well as remembering what the page "was" (in case the
>>   file was truncated). Any suggestions here are welcome.
>>
>> I also made patches for all other arches. I am posting x86_64 here first and
>> I will post the others once everyone feels comfortable with this patch.
>
> I'm about to send this into Linus.  What happened to the patches for
> other architectures?
>
> Please send them over when convenient and I'll work on getting them
> trickled out to arch maintainers, thanks.
>
>

2009-04-03 08:23:05

by Fengguang Wu

Subject: [PATCH] vfs: fix find_lock_page_retry() return value parsing

find_lock_page_retry() won't touch the *ppage value when returning
VM_FAULT_RETRY. This is fine except for the case

if (VM_RandomReadHint(vma))
goto no_cached_page;

where the 'page' could be undefined after calling find_lock_page_retry().

Fix it by checking the VM_FAULT_RETRY case first.

Cc: Ying Han <[email protected]>
Signed-off-by: Wu Fengguang <[email protected]>
---
mm/filemap.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

--- mm.orig/mm/filemap.c
+++ mm/mm/filemap.c
@@ -759,7 +759,7 @@ EXPORT_SYMBOL(find_lock_page);
* @retry: 1 indicate caller tolerate a retry.
*
* If retry flag is on, and page is already locked by someone else, return
- * a hint of retry.
+ * a hint of retry and leave *ppage untouched.
*
* Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
* points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
@@ -1672,10 +1672,10 @@ no_cached_page:

retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
vma, &page, retry_flag);
+ if (retry_ret == VM_FAULT_RETRY)
+ return retry_ret;
if (!page)
goto retry_find_nopage;
- else if (retry_ret == VM_FAULT_RETRY)
- return retry_ret;
else
goto retry_page_update;
}

2009-04-03 08:36:32

by Fengguang Wu

Subject: [PATCH v2] vfs: fix find_lock_page_retry() return value parsing

find_lock_page_retry() won't touch the *ppage value when returning
VM_FAULT_RETRY. This is fine except for the case

if (VM_RandomReadHint(vma))
goto no_cached_page;

where the 'page' could be undefined after calling find_lock_page_retry().

Fix it by checking the VM_FAULT_RETRY case first. Also do this for the
other two find_lock_page_retry() invocations for the sake of consistency.

Cc: Ying Han <[email protected]>
Signed-off-by: Wu Fengguang <[email protected]>
---
mm/filemap.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)

--- mm.orig/mm/filemap.c
+++ mm/mm/filemap.c
@@ -759,7 +759,7 @@ EXPORT_SYMBOL(find_lock_page);
* @retry: 1 indicate caller tolerate a retry.
*
* If retry flag is on, and page is already locked by someone else, return
- * a hint of retry.
+ * a hint of retry and leave *ppage untouched.
*
* Return *ppage==NULL if page is not in pagecache. Otherwise return *ppage
* points to the page in the pagecache with ret=VM_FAULT_RETRY indicate a
@@ -1575,10 +1575,10 @@ retry_find_nopage:
vmf->pgoff, 1);
retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
vma, &page, retry_flag);
- if (!page)
- goto no_cached_page;
if (retry_ret == VM_FAULT_RETRY)
return retry_ret;
+ if (!page)
+ goto no_cached_page;
}
if (PageReadahead(page)) {
page_cache_async_readahead(mapping, ra, file, page,
@@ -1617,10 +1617,10 @@ retry_find_nopage:
}
retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
vma, &page, retry_flag);
- if (!page)
- goto no_cached_page;
if (retry_ret == VM_FAULT_RETRY)
return retry_ret;
+ if (!page)
+ goto no_cached_page;
}

if (!did_readaround)
@@ -1672,10 +1672,10 @@ no_cached_page:

retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
vma, &page, retry_flag);
+ if (retry_ret == VM_FAULT_RETRY)
+ return retry_ret;
if (!page)
goto retry_find_nopage;
- else if (retry_ret == VM_FAULT_RETRY)
- return retry_ret;
else
goto retry_page_update;
}

2009-04-03 08:55:31

by Fengguang Wu

Subject: [PATCH] vfs: reduce page fault retry code

find_lock_page_retry() works the same way as find_lock_page()
when retry_flag=0, and their return values must be handled in
(almost) the same way, or it would already be a bug.

So the !retry_flag special casing can be eliminated.

Cc: Ying Han <[email protected]>
Signed-off-by: Wu Fengguang <[email protected]>
---
mm/filemap.c | 7 -------
1 file changed, 7 deletions(-)

--- mm.orig/mm/filemap.c
+++ mm/mm/filemap.c
@@ -1663,13 +1663,6 @@ no_cached_page:
* meantime, we'll just come back here and read it again.
*/
if (error >= 0) {
- /*
- * If caller cannot tolerate a retry in the ->fault path
- * go back to check the page again.
- */
- if (!retry_flag)
- goto retry_find;
-
retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
vma, &page, retry_flag);
if (retry_ret == VM_FAULT_RETRY)

2009-04-03 10:54:23

by Fengguang Wu

Subject: Re: [PATCH] vfs: reduce page fault retry code

On Fri, Apr 03, 2009 at 04:55:03PM +0800, Wu Fengguang wrote:
> find_lock_page_retry() works the same way as find_lock_page()
> when retry_flag=0, and their return values must be handled in
> (almost) the same way, or it would already be a bug.
>
> So the !retry_flag special casing can be eliminated.
>
> Cc: Ying Han <[email protected]>
> Signed-off-by: Wu Fengguang <[email protected]>
> ---
> mm/filemap.c | 7 -------
> 1 file changed, 7 deletions(-)
>
> --- mm.orig/mm/filemap.c
> +++ mm/mm/filemap.c
> @@ -1663,13 +1663,6 @@ no_cached_page:
> * meantime, we'll just come back here and read it again.
> */
> if (error >= 0) {
> - /*
> - * If caller cannot tolerate a retry in the ->fault path
> - * go back to check the page again.
> - */
> - if (!retry_flag)
> - goto retry_find;
> -
> retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
> vma, &page, retry_flag);
> if (retry_ret == VM_FAULT_RETRY)

In fact I guess we can shrink the code more aggressively.
The only difference is the extra ra->mmap_miss--, which will
be moved to another place in another planned patch.

Thanks,
Fengguang
---
mm/filemap.c | 22 +++-------------------
1 file changed, 3 insertions(+), 19 deletions(-)

--- mm.orig/mm/filemap.c
+++ mm/mm/filemap.c
@@ -1565,7 +1565,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
retry_find:
page = find_lock_page(mapping, vmf->pgoff);

-retry_find_nopage:
/*
* For sequential accesses, we use the generic readahead logic.
*/
@@ -1615,6 +1614,7 @@ retry_find_nopage:
start = vmf->pgoff - ra_pages / 2;
do_page_cache_readahead(mapping, file, start, ra_pages);
}
+retry_find_retry:
retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
vma, &page, retry_flag);
if (retry_ret == VM_FAULT_RETRY)
@@ -1626,7 +1626,6 @@ retry_find_nopage:
if (!did_readaround)
ra->mmap_miss--;

-retry_page_update:
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
@@ -1662,23 +1661,8 @@ no_cached_page:
* In the unlikely event that someone removed it in the
* meantime, we'll just come back here and read it again.
*/
- if (error >= 0) {
- /*
- * If caller cannot tolerate a retry in the ->fault path
- * go back to check the page again.
- */
- if (!retry_flag)
- goto retry_find;
-
- retry_ret = find_lock_page_retry(mapping, vmf->pgoff,
- vma, &page, retry_flag);
- if (retry_ret == VM_FAULT_RETRY)
- return retry_ret;
- if (!page)
- goto retry_find_nopage;
- else
- goto retry_page_update;
- }
+ if (error >= 0)
+ goto retry_find_retry;

/*
* An error return from page_cache_read can result if the