The page can be included into collapse as long as it doesn't have extra
pins (from GUP or otherwise).
Logic to check the refcount is moved to a separate function.
Added compound_nr(page) to the expected refcount, in order to handle
the compound page case. This is in preparation for the following patch.
VM_BUG_ON_PAGE() was removed from __collapse_huge_page_copy() as the
invariant it checks is no longer valid: the source can be mapped
multiple times now.
Signed-off-by: Kirill A. Shutemov <[email protected]>
---
mm/khugepaged.c | 41 ++++++++++++++++++++++++++++++-----------
1 file changed, 30 insertions(+), 11 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e3e41c2768d8..f9864644c3b7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -529,6 +529,24 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
}
}
+static bool is_refcount_suitable(struct page *page)
+{
+ int expected_refcount, refcount;
+
+ refcount = page_count(page);
+ expected_refcount = total_mapcount(page);
+ if (PageSwapCache(page))
+ expected_refcount += compound_nr(page);
+
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
+ pr_err("expected_refcount: %d, refcount: %d\n",
+ expected_refcount, refcount);
+ dump_page(page, "Unexpected refcount");
+ }
+
+ return page_count(page) == expected_refcount;
+}
+
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
@@ -581,11 +599,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * The page table that maps the page has been already unlinked
+ * from the page table tree and this process cannot get
+ * an additional pin on the page.
+ *
+ * New pins can come later if the page is shared across fork,
+ * but not from this process. The other process
+ * cannot write to the page, only trigger CoW.
*/
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ if (!is_refcount_suitable(page)) {
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
@@ -672,7 +696,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
- VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
@@ -1201,12 +1224,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
- /*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
- */
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ /* Check if the page has any GUP (or other external) pins */
+ if (!is_refcount_suitable(page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
--
2.26.0
On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).
>
> Logic to check the refcound is moved to a separate function.
"refcount"
> Note that the function is ready to deal with compound pages. It's
> preparation for the following patch.
Maybe:
"Added compound_nr(page) to the expected refcount, in order to handle
the compound page case. This is in preparation for the following patch."
(Just to make it clear that this is not just refactoring, but also a
change.)
>
> VM_BUG_ON_PAGE() was removed from __collapse_huge_page_copy() as the
> invariant it checks is no longer valid: the source can be mapped
> multiple times now.
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
> mm/khugepaged.c | 41 ++++++++++++++++++++++++++++++-----------
> 1 file changed, 30 insertions(+), 11 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index e3e41c2768d8..f9864644c3b7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -529,6 +529,24 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
> }
> }
>
> +static bool is_refcount_suitable(struct page *page)
> +{
This looks nice.
> + int expected_refcount, refcount;
> +
> + refcount = page_count(page);
> + expected_refcount = total_mapcount(page);
> + if (PageSwapCache(page))
> + expected_refcount += compound_nr(page);
> +
> + if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
> + pr_err("expected_refcount: %d, refcount: %d\n",
> + expected_refcount, refcount);
> + dump_page(page, "Unexpected refcount");
I see two issues with the pr_err() and the dump_page() call:
1. You probably want to rate limit this, otherwise you'll have a big
problem if lots of pages are pinned!
2. Actually, I don't think you'd want to print anything at all here, even with
rate limiting, because doing so presumes that "unexpected" means "wrong". And I
think this patch doesn't expect to have GUP pins (or pin_user_pages() pins, ha),
but that doesn't mean that they're wrong to have.
> + }
> +
> + return page_count(page) == expected_refcount;
> +}
> +
> static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> unsigned long address,
> pte_t *pte)
> @@ -581,11 +599,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> }
>
> /*
> - * cannot use mapcount: can't collapse if there's a gup pin.
> - * The page must only be referenced by the scanned process
> - * and page swap cache.
> + * Check if the page has any GUP (or other external) pins.
> + *
> + * The page table that maps the page has been already unlinked
> + * from the page table tree and this process cannot get
> + * an additinal pin on the page.
> + *
> + * New pins can come later if the page is shared across fork,
> + * but not for the this process. It is fine. The other process
"but not from this process. The other process"
(It's very hard to figure out what "it is fine" means in this context, so
better to leave it out, imho.)
> + * cannot write to the page, only trigger CoW.
> */
> - if (page_count(page) != 1 + PageSwapCache(page)) {
> + if (!is_refcount_suitable(page)) {
> unlock_page(page);
> result = SCAN_PAGE_COUNT;
> goto out;
> @@ -672,7 +696,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> } else {
> src_page = pte_page(pteval);
> copy_user_highpage(page, src_page, address, vma);
> - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
> release_pte_page(src_page);
> /*
> * ptl mostly unnecessary, but preempt has to
> @@ -1201,12 +1224,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> goto out_unmap;
> }
>
> - /*
> - * cannot use mapcount: can't collapse if there's a gup pin.
> - * The page must only be referenced by the scanned process
> - * and page swap cache.
> - */
> - if (page_count(page) != 1 + PageSwapCache(page)) {
> + /* Check if the page has any GUP (or other external) pins */
> + if (!is_refcount_suitable(page)) {
> result = SCAN_PAGE_COUNT;
> goto out_unmap;
> }
>
thanks,
--
John Hubbard
NVIDIA
On 2020-04-13 05:52, Kirill A. Shutemov wrote:
> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).
>
> Logic to check the refcound is moved to a separate function.
> Note that the function is ready to deal with compound pages. It's
> preparation for the following patch.
>
> VM_BUG_ON_PAGE() was removed from __collapse_huge_page_copy() as the
> invariant it checks is no longer valid: the source can be mapped
> multiple times now.
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
> mm/khugepaged.c | 41 ++++++++++++++++++++++++++++++-----------
> 1 file changed, 30 insertions(+), 11 deletions(-)
Reviewed-by: John Hubbard <[email protected]>
thanks,
--
John Hubbard
NVIDIA
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index e3e41c2768d8..f9864644c3b7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -529,6 +529,24 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
> }
> }
>
> +static bool is_refcount_suitable(struct page *page)
> +{
> + int expected_refcount, refcount;
> +
> + refcount = page_count(page);
> + expected_refcount = total_mapcount(page);
> + if (PageSwapCache(page))
> + expected_refcount += compound_nr(page);
> +
> + if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
> + pr_err("expected_refcount: %d, refcount: %d\n",
> + expected_refcount, refcount);
> + dump_page(page, "Unexpected refcount");
> + }
> +
> + return page_count(page) == expected_refcount;
> +}
> +
> static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> unsigned long address,
> pte_t *pte)
> @@ -581,11 +599,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> }
>
> /*
> - * cannot use mapcount: can't collapse if there's a gup pin.
> - * The page must only be referenced by the scanned process
> - * and page swap cache.
> + * Check if the page has any GUP (or other external) pins.
> + *
> + * The page table that maps the page has been already unlinked
> + * from the page table tree and this process cannot get
> + * an additinal pin on the page.
> + *
> + * New pins can come later if the page is shared across fork,
> + * but not for the this process. It is fine. The other process
> + * cannot write to the page, only trigger CoW.
> */
> - if (page_count(page) != 1 + PageSwapCache(page)) {
> + if (!is_refcount_suitable(page)) {
> unlock_page(page);
> result = SCAN_PAGE_COUNT;
> goto out;
> @@ -672,7 +696,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> } else {
> src_page = pte_page(pteval);
> copy_user_highpage(page, src_page, address, vma);
> - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
> release_pte_page(src_page);
> /*
> * ptl mostly unnecessary, but preempt has to
> @@ -1201,12 +1224,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> goto out_unmap;
> }
>
> - /*
> - * cannot use mapcount: can't collapse if there's a gup pin.
> - * The page must only be referenced by the scanned process
> - * and page swap cache.
> - */
> - if (page_count(page) != 1 + PageSwapCache(page)) {
> + /* Check if the page has any GUP (or other external) pins */
> + if (!is_refcount_suitable(page)) {
> result = SCAN_PAGE_COUNT;
> goto out_unmap;
> }
>
On Mon, Apr 13, 2020 at 01:48:22PM -0700, John Hubbard wrote:
[Thanks for all your suggestions and corrections]
> > + if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
> > + pr_err("expected_refcount: %d, refcount: %d\n",
> > + expected_refcount, refcount);
> > + dump_page(page, "Unexpected refcount");
>
>
> I see two issues with the pr_err() and the dump_page() call:
>
> 1. You probably want to rate limit this, otherwise you'll have a big
> problem if lots of pages are pinned!
Nope. Only if kernel is buggy. See below.
> 2. Actually, I don't think you'd want to print anything at all here, even with
> rate limiting, because doing so presumes that "unexpected" means "wrong". And I
> think this patch doesn't expect to have GUP pins (or pin_user_pages() pins, ha),
> but that doesn't mean that they're wrong to have.
See condition. We only do it if refcount is *below* expected refcount. It
should never happen. Pinned page would have refcount above expected.
--
Kirill A. Shutemov
On 2020-04-14 14:35, Kirill A. Shutemov wrote:
> On Mon, Apr 13, 2020 at 01:48:22PM -0700, John Hubbard wrote:
>
> [Thanks for all your suggestions and corrections]
>
>>> + if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
>>> + pr_err("expected_refcount: %d, refcount: %d\n",
>>> + expected_refcount, refcount);
>>> + dump_page(page, "Unexpected refcount");
>>
>>
>> I see two issues with the pr_err() and the dump_page() call:
>>
>> 1. You probably want to rate limit this, otherwise you'll have a big
>> problem if lots of pages are pinned!
>
> Nope. Only if kernel is buggy. See below.
>
>> 2. Actually, I don't think you'd want to print anything at all here, even with
>> rate limiting, because doing so presumes that "unexpected" means "wrong". And I
>> think this patch doesn't expect to have GUP pins (or pin_user_pages() pins, ha),
>> but that doesn't mean that they're wrong to have.
>
> See condition. We only do it if refcount is *below* expected refcount. It
> should never happen. Pinned page would have refcount above expected.
>
Yes, you are right. I misread the condition. This actually is just right. :)
thanks,
--
John Hubbard
NVIDIA
On 4/13/20 5:52 AM, Kirill A. Shutemov wrote:
> The page can be included into collapse as long as it doesn't have extra
> pins (from GUP or otherwise).
>
> Logic to check the refcound is moved to a separate function.
s/refcound/refcount
> Note that the function is ready to deal with compound pages. It's
> preparation for the following patch.
>
> VM_BUG_ON_PAGE() was removed from __collapse_huge_page_copy() as the
> invariant it checks is no longer valid: the source can be mapped
> multiple times now.
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> ---
> mm/khugepaged.c | 41 ++++++++++++++++++++++++++++++-----------
> 1 file changed, 30 insertions(+), 11 deletions(-)
Just a minor typo problem.
Acked-by: Yang Shi <[email protected]>
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index e3e41c2768d8..f9864644c3b7 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -529,6 +529,24 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
> }
> }
>
> +static bool is_refcount_suitable(struct page *page)
> +{
> + int expected_refcount, refcount;
> +
> + refcount = page_count(page);
> + expected_refcount = total_mapcount(page);
> + if (PageSwapCache(page))
> + expected_refcount += compound_nr(page);
> +
> + if (IS_ENABLED(CONFIG_DEBUG_VM) && expected_refcount > refcount) {
> + pr_err("expected_refcount: %d, refcount: %d\n",
> + expected_refcount, refcount);
> + dump_page(page, "Unexpected refcount");
> + }
> +
> + return page_count(page) == expected_refcount;
> +}
> +
> static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> unsigned long address,
> pte_t *pte)
> @@ -581,11 +599,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> }
>
> /*
> - * cannot use mapcount: can't collapse if there's a gup pin.
> - * The page must only be referenced by the scanned process
> - * and page swap cache.
> + * Check if the page has any GUP (or other external) pins.
> + *
> + * The page table that maps the page has been already unlinked
> + * from the page table tree and this process cannot get
> + * an additinal pin on the page.
> + *
> + * New pins can come later if the page is shared across fork,
> + * but not for the this process. It is fine. The other process
> + * cannot write to the page, only trigger CoW.
> */
> - if (page_count(page) != 1 + PageSwapCache(page)) {
> + if (!is_refcount_suitable(page)) {
> unlock_page(page);
> result = SCAN_PAGE_COUNT;
> goto out;
> @@ -672,7 +696,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
> } else {
> src_page = pte_page(pteval);
> copy_user_highpage(page, src_page, address, vma);
> - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
> release_pte_page(src_page);
> /*
> * ptl mostly unnecessary, but preempt has to
> @@ -1201,12 +1224,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> goto out_unmap;
> }
>
> - /*
> - * cannot use mapcount: can't collapse if there's a gup pin.
> - * The page must only be referenced by the scanned process
> - * and page swap cache.
> - */
> - if (page_count(page) != 1 + PageSwapCache(page)) {
> + /* Check if the page has any GUP (or other external) pins */
> + if (!is_refcount_suitable(page)) {
> result = SCAN_PAGE_COUNT;
> goto out_unmap;
> }