2023-07-17 14:39:22

by Ryan Roberts

[permalink] [raw]
Subject: [PATCH v1 0/3] Optimize large folio interaction with deferred split

Hi All,

This is a small series in support of my work to enable the use of large folios
for anonymous memory (currently called "FLEXIBLE_THP") [1]. It first makes it
possible to add large, non-pmd-mappable folios to the deferred split queue. Then
it modifies zap_pte_range() to batch-remove spans of physically contiguous pages
from the rmap, which means that in the common case, we elide the need to ever
put the folio on the deferred split queue, thus reducing lock contention and
improving performance.

This becomes more visible once there are lots of large anonymous folios in the
system, and Huang Ying has suggested that solving it should be a prerequisite
for merging the main FLEXIBLE_THP work.

The series applies on top of v6.5-rc2 and a branch is available at [2].

I don't have a full test run with the latest versions of all the patches on
top of the latest baseline, so I'm not posting results formally; I can get
them if people feel they are necessary. Anecdotally, for the kernel
compilation workload, this series reduces kernel time by ~4% and real time by
~0.4%, compared with [1].

[1] https://lore.kernel.org/linux-mm/[email protected]/
[2] https://gitlab.arm.com/linux-arm/linux-rr/-/tree/features/granule_perf/deferredsplit-lkml_v1

Thanks,
Ryan


Ryan Roberts (3):
mm: Allow deferred splitting of arbitrary large anon folios
mm: Implement folio_remove_rmap_range()
mm: Batch-zap large anonymous folio PTE mappings

include/linux/rmap.h | 2 +
mm/memory.c | 119 +++++++++++++++++++++++++++++++++++++++++++
mm/rmap.c | 67 +++++++++++++++++++++++-
3 files changed, 187 insertions(+), 1 deletion(-)

--
2.25.1



2023-07-17 14:57:35

by Ryan Roberts

[permalink] [raw]
Subject: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

Like page_remove_rmap(), but batch-removes the rmap for a range of pages
belonging to a folio. This can provide a small speedup due to less
manipulation of the various counters. But more crucially, when the rmap for
all pages of a folio is removed in one batch, there is no need to
(spuriously) add the folio to the deferred split list, which saves significant
cost when there is contention for the split queue lock.

All contained pages are accounted using the order-0 folio (or base page)
scheme.

Signed-off-by: Ryan Roberts <[email protected]>
---
include/linux/rmap.h | 2 ++
mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 67 insertions(+)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b87d01660412..f578975c12c0 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -200,6 +200,8 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *,
bool compound);
void page_remove_rmap(struct page *, struct vm_area_struct *,
bool compound);
+void folio_remove_rmap_range(struct folio *folio, struct page *page,
+ int nr, struct vm_area_struct *vma);

void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long address, rmap_t flags);
diff --git a/mm/rmap.c b/mm/rmap.c
index 2baf57d65c23..1da05aca2bb1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1359,6 +1359,71 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
mlock_vma_folio(folio, vma, compound);
}

+/*
+ * folio_remove_rmap_range - take down pte mappings from a range of pages
+ * belonging to a folio. All pages are accounted as small pages.
+ * @folio: folio that all pages belong to
+ * @page: first page in range to remove mapping from
+ * @nr: number of pages in range to remove mapping from
+ * @vma: the vm area from which the mapping is removed
+ *
+ * The caller needs to hold the pte lock.
+ */
+void folio_remove_rmap_range(struct folio *folio, struct page *page,
+ int nr, struct vm_area_struct *vma)
+{
+ atomic_t *mapped = &folio->_nr_pages_mapped;
+ int nr_unmapped = 0;
+ int nr_mapped;
+ bool last;
+ enum node_stat_item idx;
+
+ if (unlikely(folio_test_hugetlb(folio))) {
+ VM_WARN_ON_FOLIO(1, folio);
+ return;
+ }
+
+ if (!folio_test_large(folio)) {
+ /* Is this the page's last map to be removed? */
+ last = atomic_add_negative(-1, &page->_mapcount);
+ nr_unmapped = last;
+ } else {
+ for (; nr != 0; nr--, page++) {
+ /* Is this the page's last map to be removed? */
+ last = atomic_add_negative(-1, &page->_mapcount);
+ if (last) {
+ /* Page still mapped if folio mapped entirely */
+ nr_mapped = atomic_dec_return_relaxed(mapped);
+ if (nr_mapped < COMPOUND_MAPPED)
+ nr_unmapped++;
+ }
+ }
+ }
+
+ if (nr_unmapped) {
+ idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
+ __lruvec_stat_mod_folio(folio, idx, -nr_unmapped);
+
+ /*
+ * Queue anon THP for deferred split if we have just unmapped at
+ * least 1 page, while at least 1 page remains mapped.
+ */
+ if (folio_test_large(folio) && folio_test_anon(folio))
+ if (nr_mapped)
+ deferred_split_folio(folio);
+ }
+
+ /*
+ * It would be tidy to reset folio_test_anon mapping when fully
+ * unmapped, but that might overwrite a racing page_add_anon_rmap
+ * which increments mapcount after us but sets mapping before us:
+ * so leave the reset to free_pages_prepare, and remember that
+ * it's only reliable while mapped.
+ */
+
+ munlock_vma_folio(folio, vma, false);
+}
+
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
--
2.25.1


2023-07-17 14:58:18

by Ryan Roberts

[permalink] [raw]
Subject: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

In preparation for the introduction of large folios for anonymous
memory, we would like to be able to split them when they have unmapped
subpages, in order to free those unused pages under memory pressure. So
remove the artificial requirement that the large folio needed to be at
least PMD-sized.

Signed-off-by: Ryan Roberts <[email protected]>
Reviewed-by: Yu Zhao <[email protected]>
Reviewed-by: Yin Fengwei <[email protected]>
---
mm/rmap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 0c0d8857dfce..2baf57d65c23 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1430,7 +1430,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
* page of the folio is unmapped and at least one page
* is still mapped.
*/
- if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
+ if (folio_test_large(folio) && folio_test_anon(folio))
if (!compound || nr < nr_pmdmapped)
deferred_split_folio(folio);
}
--
2.25.1


2023-07-17 14:59:29

by Ryan Roberts

[permalink] [raw]
Subject: [PATCH v1 3/3] mm: Batch-zap large anonymous folio PTE mappings

This allows batching the rmap removal with folio_remove_rmap_range(),
which means we avoid spuriously adding a partially unmapped folio to the
deferred split queue in the common case, which reduces split queue lock
contention.

Previously each page was removed from the rmap individually with
page_remove_rmap(). If the first page belonged to a large folio, this
would cause page_remove_rmap() to conclude that the folio was now
partially mapped and add the folio to the deferred split queue. But
subsequent calls would cause the folio to become fully unmapped, meaning
there is no value to adding it to the split queue.

Signed-off-by: Ryan Roberts <[email protected]>
---
mm/memory.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 119 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 01f39e8144ef..6facb8c8807a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1391,6 +1391,95 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
}

+static inline unsigned long page_addr(struct page *page,
+ struct page *anchor, unsigned long anchor_addr)
+{
+ unsigned long offset;
+ unsigned long addr;
+
+ offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
+ addr = anchor_addr + offset;
+
+ if (anchor > page) {
+ if (addr > anchor_addr)
+ return 0;
+ } else {
+ if (addr < anchor_addr)
+ return ULONG_MAX;
+ }
+
+ return addr;
+}
+
+static int calc_anon_folio_map_pgcount(struct folio *folio,
+ struct page *page, pte_t *pte,
+ unsigned long addr, unsigned long end)
+{
+ pte_t ptent;
+ int floops;
+ int i;
+ unsigned long pfn;
+
+ end = min(page_addr(&folio->page + folio_nr_pages(folio), page, addr),
+ end);
+ floops = (end - addr) >> PAGE_SHIFT;
+ pfn = page_to_pfn(page);
+ pfn++;
+ pte++;
+
+ for (i = 1; i < floops; i++) {
+ ptent = ptep_get(pte);
+
+ if (!pte_present(ptent) ||
+ pte_pfn(ptent) != pfn) {
+ return i;
+ }
+
+ pfn++;
+ pte++;
+ }
+
+ return floops;
+}
+
+static unsigned long zap_anon_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ struct page *page, pte_t *pte,
+ unsigned long addr, unsigned long end,
+ bool *full_out)
+{
+ struct folio *folio = page_folio(page);
+ struct mm_struct *mm = tlb->mm;
+ pte_t ptent;
+ int pgcount;
+ int i;
+ bool full;
+
+ pgcount = calc_anon_folio_map_pgcount(folio, page, pte, addr, end);
+
+ for (i = 0; i < pgcount;) {
+ ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ full = __tlb_remove_page(tlb, page, 0);
+
+ if (unlikely(page_mapcount(page) < 1))
+ print_bad_pte(vma, addr, ptent, page);
+
+ i++;
+ page++;
+ pte++;
+ addr += PAGE_SIZE;
+
+ if (unlikely(full))
+ break;
+ }
+
+ folio_remove_rmap_range(folio, page - i, i, vma);
+
+ *full_out = full;
+ return i;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1428,6 +1517,36 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
page = vm_normal_page(vma, addr, ptent);
if (unlikely(!should_zap_page(details, page)))
continue;
+
+ /*
+ * Batch zap large anonymous folio mappings. This allows
+ * batching the rmap removal, which means we avoid
+ * spuriously adding a partially unmapped folio to the
+ * deferred split queue in the common case, which
+ * reduces split queue lock contention. Require the VMA
+ * to be anonymous to ensure that none of the PTEs in
+ * the range require zap_install_uffd_wp_if_needed().
+ */
+ if (page && PageAnon(page) && vma_is_anonymous(vma)) {
+ bool full;
+ int pgcount;
+
+ pgcount = zap_anon_pte_range(tlb, vma,
+ page, pte, addr, end, &full);
+
+ rss[mm_counter(page)] -= pgcount;
+ pgcount--;
+ pte += pgcount;
+ addr += pgcount << PAGE_SHIFT;
+
+ if (unlikely(full)) {
+ force_flush = 1;
+ addr += PAGE_SIZE;
+ break;
+ }
+ continue;
+ }
+
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
--
2.25.1


2023-07-17 15:28:28

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

On Mon, Jul 17, 2023 at 03:31:09PM +0100, Ryan Roberts wrote:
> +/*
> + * folio_remove_rmap_range - take down pte mappings from a range of pages
> + * belonging to a folio. All pages are accounted as small pages.
> + * @folio: folio that all pages belong to
> + * @page: first page in range to remove mapping from
> + * @nr: number of pages in range to remove mapping from
> + * @vma: the vm area from which the mapping is removed
> + *
> + * The caller needs to hold the pte lock.
> + */

This could stand a little reworking. How about this?

/**
* folio_remove_rmap_range - Take down PTE mappings from a range of pages.
* @folio: Folio containing all pages in range.
* @page: First page in range to unmap.
* @nr: Number of pages to unmap.
* @vma: The VM area containing the range.
*
* All pages in the range must belong to the same VMA & folio. They
* must be mapped with PTEs, not a PMD.
*
* Context: Caller holds the pte lock.
*/

> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
> + int nr, struct vm_area_struct *vma)
> +{
> + atomic_t *mapped = &folio->_nr_pages_mapped;
> + int nr_unmapped = 0;
> + int nr_mapped;
> + bool last;
> + enum node_stat_item idx;
> +
> + if (unlikely(folio_test_hugetlb(folio))) {
> + VM_WARN_ON_FOLIO(1, folio);
> + return;
> + }
> +
> + if (!folio_test_large(folio)) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + nr_unmapped = last;
> + } else {
> + for (; nr != 0; nr--, page++) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + if (last) {
> + /* Page still mapped if folio mapped entirely */
> + nr_mapped = atomic_dec_return_relaxed(mapped);

We're still doing one atomic op per page on the folio's nr_pages_mapped
... is it possible to batch this and use atomic_sub_return_relaxed()?


2023-07-17 15:30:04

by Zi Yan

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

On 17 Jul 2023, at 10:31, Ryan Roberts wrote:

> Like page_remove_rmap() but batch-removes the rmap for a range of pages
> belonging to a folio. This can provide a small speedup due to less
> manipuation of the various counters. But more crucially, if removing the
> rmap for all pages of a folio in a batch, there is no need to
> (spuriously) add it to the deferred split list, which saves significant
> cost when there is contention for the split queue lock.
>
> All contained pages are accounted using the order-0 folio (or base page)
> scheme.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> include/linux/rmap.h | 2 ++
> mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 67 insertions(+)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index b87d01660412..f578975c12c0 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -200,6 +200,8 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *,
> bool compound);
> void page_remove_rmap(struct page *, struct vm_area_struct *,
> bool compound);
> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
> + int nr, struct vm_area_struct *vma);
>
> void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
> unsigned long address, rmap_t flags);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 2baf57d65c23..1da05aca2bb1 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1359,6 +1359,71 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
> mlock_vma_folio(folio, vma, compound);
> }
>
> +/*
> + * folio_remove_rmap_range - take down pte mappings from a range of pages
> + * belonging to a folio. All pages are accounted as small pages.
> + * @folio: folio that all pages belong to
> + * @page: first page in range to remove mapping from
> + * @nr: number of pages in range to remove mapping from

We might need some checks to make sure [page, page+nr] is in the range of
the folio. Something like:

page >= &folio->page && page + nr < (&folio->page + folio_nr_pages(folio))

> + * @vma: the vm area from which the mapping is removed
> + *
> + * The caller needs to hold the pte lock.
> + */
> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
> + int nr, struct vm_area_struct *vma)
> +{
> + atomic_t *mapped = &folio->_nr_pages_mapped;
> + int nr_unmapped = 0;
> + int nr_mapped;
> + bool last;
> + enum node_stat_item idx;
> +
> + if (unlikely(folio_test_hugetlb(folio))) {
> + VM_WARN_ON_FOLIO(1, folio);
> + return;
> + }
> +
> + if (!folio_test_large(folio)) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + nr_unmapped = last;
> + } else {
> + for (; nr != 0; nr--, page++) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + if (last) {
> + /* Page still mapped if folio mapped entirely */
> + nr_mapped = atomic_dec_return_relaxed(mapped);
> + if (nr_mapped < COMPOUND_MAPPED)
> + nr_unmapped++;
> + }
> + }
> + }
> +
> + if (nr_unmapped) {
> + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
> + __lruvec_stat_mod_folio(folio, idx, -nr_unmapped);
> +
> + /*
> + * Queue anon THP for deferred split if we have just unmapped at
> + * least 1 page, while at least 1 page remains mapped.
> + */
> + if (folio_test_large(folio) && folio_test_anon(folio))
> + if (nr_mapped)
> + deferred_split_folio(folio);
> + }
> +
> + /*
> + * It would be tidy to reset folio_test_anon mapping when fully
> + * unmapped, but that might overwrite a racing page_add_anon_rmap
> + * which increments mapcount after us but sets mapping before us:
> + * so leave the reset to free_pages_prepare, and remember that
> + * it's only reliable while mapped.
> + */
> +
> + munlock_vma_folio(folio, vma, false);
> +}
> +
> /**
> * page_remove_rmap - take down pte mapping from a page
> * @page: page to remove mapping from
> --
> 2.25.1

Everything else looks good to me. Reviewed-by: Zi Yan <[email protected]>

--
Best Regards,
Yan, Zi



2023-07-17 15:49:52

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On Mon, Jul 17, 2023 at 03:31:08PM +0100, Ryan Roberts wrote:
> In preparation for the introduction of large folios for anonymous
> memory, we would like to be able to split them when they have unmapped
> subpages, in order to free those unused pages under memory pressure. So
> remove the artificial requirement that the large folio needed to be at
> least PMD-sized.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> Reviewed-by: Yu Zhao <[email protected]>
> Reviewed-by: Yin Fengwei <[email protected]>

Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>

> */
> - if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
> + if (folio_test_large(folio) && folio_test_anon(folio))
> if (!compound || nr < nr_pmdmapped)
> deferred_split_folio(folio);

I wonder if it's worth introducing a folio_test_deferred_split() (better
naming appreciated ...) to allow us to allocate order-1 folios and not
do horrible things. Maybe it's not worth supporting order-1 folios;
we're always better off going to order-2 immediately. Just thinking.
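
For illustration only, a minimal sketch of what such a predicate could look
like (the name comes from the suggestion above; the order-based test is an
assumption, based on _deferred_list living in the folio's third struct page):

/*
 * Hypothetical helper; the exact test is an assumption. Only folios of
 * order >= 2 have a third struct page, so only those can carry a
 * _deferred_list and go on the deferred split queue.
 */
static inline bool folio_test_deferred_split(struct folio *folio)
{
	return folio_order(folio) >= 2;
}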

2023-07-17 16:08:36

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On Mon, Jul 17, 2023 at 05:43:40PM +0200, David Hildenbrand wrote:
> On 17.07.23 17:41, Ryan Roberts wrote:
> > On 17/07/2023 16:30, Matthew Wilcox wrote:
> > > On Mon, Jul 17, 2023 at 03:31:08PM +0100, Ryan Roberts wrote:
> > > > In preparation for the introduction of large folios for anonymous
> > > > memory, we would like to be able to split them when they have unmapped
> > > > subpages, in order to free those unused pages under memory pressure. So
> > > > remove the artificial requirement that the large folio needed to be at
> > > > least PMD-sized.
> > > >
> > > > Signed-off-by: Ryan Roberts <[email protected]>
> > > > Reviewed-by: Yu Zhao <[email protected]>
> > > > Reviewed-by: Yin Fengwei <[email protected]>
> > >
> > > Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
> >
> > Thanks!
> >
> > >
> > > > */
> > > > - if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
> > > > + if (folio_test_large(folio) && folio_test_anon(folio))
> > > > if (!compound || nr < nr_pmdmapped)
> > > > deferred_split_folio(folio);
> > >
> > > I wonder if it's worth introducing a folio_test_deferred_split() (better
> > > naming appreciated ...) to allow us to allocate order-1 folios and not
> > > do horrible things. Maybe it's not worth supporting order-1 folios;
> > > we're always better off going to order-2 immediately. Just thinking.
> >
> > There is more than just _deferred_list in the 3rd page; you also have _flags_2a
> > and _head_2a. I guess you know much better than me what they store. But I'm
> > guessing its harder than jsut not splitting an order-1 page?

Those are page->flags and page->compound_head for the third page in
the folio. They don't really need a name; nothing refers to them,
but it's important that space not be reused ;-)

This is slightly different from _flags_1; we do have some flags which
reuse the bits (they're labelled as PF_SECOND). Right now, it's only
PG_has_hwpoisoned, but we used to have PG_double_map. Others may arise.

> > With the direction of large anon folios (_not_ retrying with every order down to
> > 0), I'm not sure what the use case would be for order-1 anyway?
>
> Just noting that we might need some struct-page space for better
> mapcount/shared tracking, which might get hard for order-1 pages.

My assumption had been that we'd be able to reuse the _entire_mapcount
and _nr_pages_mapped fields and not spill into the third page, but the
third page is definitely available today if we want it. I'm fine with
disallowing order-1 anon/file folios forever.

2023-07-17 16:13:16

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On 17/07/2023 16:30, Matthew Wilcox wrote:
> On Mon, Jul 17, 2023 at 03:31:08PM +0100, Ryan Roberts wrote:
>> In preparation for the introduction of large folios for anonymous
>> memory, we would like to be able to split them when they have unmapped
>> subpages, in order to free those unused pages under memory pressure. So
>> remove the artificial requirement that the large folio needed to be at
>> least PMD-sized.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> Reviewed-by: Yu Zhao <[email protected]>
>> Reviewed-by: Yin Fengwei <[email protected]>
>
> Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>

Thanks!

>
>> */
>> - if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
>> + if (folio_test_large(folio) && folio_test_anon(folio))
>> if (!compound || nr < nr_pmdmapped)
>> deferred_split_folio(folio);
>
> I wonder if it's worth introducing a folio_test_deferred_split() (better
> naming appreciated ...) to allow us to allocate order-1 folios and not
> do horrible things. Maybe it's not worth supporting order-1 folios;
> we're always better off going to order-2 immediately. Just thinking.

There is more than just _deferred_list in the 3rd page; you also have _flags_2a
and _head_2a. I guess you know much better than me what they store. But I'm
guessing it's harder than just not splitting an order-1 page?

With the direction of large anon folios (_not_ retrying with every order down to
0), I'm not sure what the use case would be for order-1 anyway?

2023-07-17 16:14:05

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

On Mon, Jul 17, 2023 at 04:49:19PM +0100, Ryan Roberts wrote:
> > We're still doing one atomic op per page on the folio's nr_pages_mapped
> > ... is it possible to batch this and use atomic_sub_return_relaxed()?
>
> Good spot, something like this:
>
> 	} else {
> 		for (; nr != 0; nr--, page++) {
> 			/* Is this the page's last map to be removed? */
> 			last = atomic_add_negative(-1, &page->_mapcount);
> 			if (last)
> 				nr_unmapped++;
> 		}
>
> 		/* Pages still mapped if folio mapped entirely */
> 		nr_mapped = atomic_sub_return_relaxed(nr_unmapped, mapped);
> 		if (nr_mapped >= COMPOUND_MAPPED)
> 			nr_unmapped = 0;
> 	}

I think that's right, but my eyes always go slightly crossed trying to
read the new mapcount scheme.

2023-07-17 16:15:18

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On 17/07/2023 16:42, David Hildenbrand wrote:
> On 17.07.23 16:31, Ryan Roberts wrote:
>> In preparation for the introduction of large folios for anonymous
>> memory, we would like to be able to split them when they have unmapped
>> subpages, in order to free those unused pages under memory pressure. So
>> remove the artificial requirement that the large folio needed to be at
>> least PMD-sized.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> Reviewed-by: Yu Zhao <[email protected]>
>> Reviewed-by: Yin Fengwei <[email protected]>
>> ---
>>   mm/rmap.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 0c0d8857dfce..2baf57d65c23 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1430,7 +1430,7 @@ void page_remove_rmap(struct page *page, struct
>> vm_area_struct *vma,
>>            * page of the folio is unmapped and at least one page
>>            * is still mapped.
>>            */
>> -        if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
>> +        if (folio_test_large(folio) && folio_test_anon(folio))
>>               if (!compound || nr < nr_pmdmapped)
>>                   deferred_split_folio(folio);
>
> !compound will always be true I guess, so nr_pmdmapped == 0 (which will always
> be the case) will be ignored.

I don't follow why !compound will always be true. This function is
page_remove_rmap() (not folio_remove_rmap_range() which I add in a later patch).
page_remove_rmap() can work on pmd-mapped pages where compound=true is passed in.

>
> Reviewed-by: David Hildenbrand <[email protected]>
>


2023-07-17 16:18:09

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On 17.07.23 16:31, Ryan Roberts wrote:
> In preparation for the introduction of large folios for anonymous
> memory, we would like to be able to split them when they have unmapped
> subpages, in order to free those unused pages under memory pressure. So
> remove the artificial requirement that the large folio needed to be at
> least PMD-sized.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> Reviewed-by: Yu Zhao <[email protected]>
> Reviewed-by: Yin Fengwei <[email protected]>
> ---
> mm/rmap.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 0c0d8857dfce..2baf57d65c23 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1430,7 +1430,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
> * page of the folio is unmapped and at least one page
> * is still mapped.
> */
> - if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
> + if (folio_test_large(folio) && folio_test_anon(folio))
> if (!compound || nr < nr_pmdmapped)
> deferred_split_folio(folio);

!compound will always be true I guess, so nr_pmdmapped == 0 (which will
always be the case) will be ignored.

Reviewed-by: David Hildenbrand <[email protected]>

--
Cheers,

David / dhildenb


2023-07-17 16:19:12

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On 17.07.23 17:41, Ryan Roberts wrote:
> On 17/07/2023 16:30, Matthew Wilcox wrote:
>> On Mon, Jul 17, 2023 at 03:31:08PM +0100, Ryan Roberts wrote:
>>> In preparation for the introduction of large folios for anonymous
>>> memory, we would like to be able to split them when they have unmapped
>>> subpages, in order to free those unused pages under memory pressure. So
>>> remove the artificial requirement that the large folio needed to be at
>>> least PMD-sized.
>>>
>>> Signed-off-by: Ryan Roberts <[email protected]>
>>> Reviewed-by: Yu Zhao <[email protected]>
>>> Reviewed-by: Yin Fengwei <[email protected]>
>>
>> Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
>
> Thanks!
>
>>
>>> */
>>> - if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
>>> + if (folio_test_large(folio) && folio_test_anon(folio))
>>> if (!compound || nr < nr_pmdmapped)
>>> deferred_split_folio(folio);
>>
>> I wonder if it's worth introducing a folio_test_deferred_split() (better
>> naming appreciated ...) to allow us to allocate order-1 folios and not
>> do horrible things. Maybe it's not worth supporting order-1 folios;
>> we're always better off going to order-2 immediately. Just thinking.
>
> There is more than just _deferred_list in the 3rd page; you also have _flags_2a
> and _head_2a. I guess you know much better than me what they store. But I'm
> guessing its harder than jsut not splitting an order-1 page?
>
> With the direction of large anon folios (_not_ retrying with every order down to
> 0), I'm not sure what the use case would be for order-1 anyway?

Just noting that we might need some struct-page space for better
mapcount/shared tracking, which might get hard for order-1 pages.

--
Cheers,

David / dhildenb


2023-07-17 16:21:55

by Zi Yan

[permalink] [raw]
Subject: Re: [PATCH v1 3/3] mm: Batch-zap large anonymous folio PTE mappings

On 17 Jul 2023, at 10:31, Ryan Roberts wrote:

> This allows batching the rmap removal with folio_remove_rmap_range(),
> which means we avoid spuriously adding a partially unmapped folio to the
> deferrred split queue in the common case, which reduces split queue lock
> contention.
>
> Previously each page was removed from the rmap individually with
> page_remove_rmap(). If the first page belonged to a large folio, this
> would cause page_remove_rmap() to conclude that the folio was now
> partially mapped and add the folio to the deferred split queue. But
> subsequent calls would cause the folio to become fully unmapped, meaning
> there is no value to adding it to the split queue.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> mm/memory.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 119 insertions(+)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 01f39e8144ef..6facb8c8807a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1391,6 +1391,95 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
> pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
> }
>
> +static inline unsigned long page_addr(struct page *page,
> + struct page *anchor, unsigned long anchor_addr)
> +{
> + unsigned long offset;
> + unsigned long addr;
> +
> + offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
> + addr = anchor_addr + offset;
> +
> + if (anchor > page) {
> + if (addr > anchor_addr)
> + return 0;
> + } else {
> + if (addr < anchor_addr)
> + return ULONG_MAX;
> + }
> +
> + return addr;
> +}
> +
> +static int calc_anon_folio_map_pgcount(struct folio *folio,
> + struct page *page, pte_t *pte,
> + unsigned long addr, unsigned long end)
> +{
> + pte_t ptent;
> + int floops;
> + int i;
> + unsigned long pfn;
> +
> + end = min(page_addr(&folio->page + folio_nr_pages(folio), page, addr),
> + end);
> + floops = (end - addr) >> PAGE_SHIFT;
> + pfn = page_to_pfn(page);
> + pfn++;
> + pte++;
> +
> + for (i = 1; i < floops; i++) {
> + ptent = ptep_get(pte);
> +
> + if (!pte_present(ptent) ||
> + pte_pfn(ptent) != pfn) {
> + return i;
> + }
> +
> + pfn++;
> + pte++;
> + }
> +
> + return floops;
> +}
> +
> +static unsigned long zap_anon_pte_range(struct mmu_gather *tlb,
> + struct vm_area_struct *vma,
> + struct page *page, pte_t *pte,
> + unsigned long addr, unsigned long end,
> + bool *full_out)
> +{
> + struct folio *folio = page_folio(page);
> + struct mm_struct *mm = tlb->mm;
> + pte_t ptent;
> + int pgcount;
> + int i;
> + bool full;
> +
> + pgcount = calc_anon_folio_map_pgcount(folio, page, pte, addr, end);
> +
> + for (i = 0; i < pgcount;) {
> + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
> + tlb_remove_tlb_entry(tlb, pte, addr);
> + full = __tlb_remove_page(tlb, page, 0);
> +
> + if (unlikely(page_mapcount(page) < 1))
> + print_bad_pte(vma, addr, ptent, page);
> +
> + i++;
> + page++;
> + pte++;
> + addr += PAGE_SIZE;
> +
> + if (unlikely(full))
> + break;
> + }
> +
> + folio_remove_rmap_range(folio, page - i, i, vma);
> +
> + *full_out = full;
> + return i;
> +}
> +
> static unsigned long zap_pte_range(struct mmu_gather *tlb,
> struct vm_area_struct *vma, pmd_t *pmd,
> unsigned long addr, unsigned long end,
> @@ -1428,6 +1517,36 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> page = vm_normal_page(vma, addr, ptent);
> if (unlikely(!should_zap_page(details, page)))
> continue;
> +
> + /*
> + * Batch zap large anonymous folio mappings. This allows
> + * batching the rmap removal, which means we avoid
> + * spuriously adding a partially unmapped folio to the
> + * deferrred split queue in the common case, which
> + * reduces split queue lock contention. Require the VMA
> + * to be anonymous to ensure that none of the PTEs in
> + * the range require zap_install_uffd_wp_if_needed().
> + */
> + if (page && PageAnon(page) && vma_is_anonymous(vma)) {
> + bool full;
> + int pgcount;
> +
> + pgcount = zap_anon_pte_range(tlb, vma,
> + page, pte, addr, end, &full);

Are you trying to zap as many ptes as possible if all these ptes are
within a folio? If so, why not calculate end before calling zap_anon_pte_range()?
That would make zap_anon_pte_range() simpler. Also check if page is part of
a large folio first to make sure you can batch.
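
For illustration only, a minimal sketch of clamping the end in the caller
before the call (the helper name is made up; it assumes folio_page_idx() and
folio_nr_pages() behave as in current mainline):

/* Hypothetical sketch: bound the zap range to the end of this folio. */
static inline unsigned long folio_bound_end(struct folio *folio,
				struct page *page, unsigned long addr,
				unsigned long end)
{
	unsigned long nr = folio_nr_pages(folio) - folio_page_idx(folio, page);

	return min(end, addr + (nr << PAGE_SHIFT));
}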

> +
> + rss[mm_counter(page)] -= pgcount;
> + pgcount--;
> + pte += pgcount;
> + addr += pgcount << PAGE_SHIFT;
> +
> + if (unlikely(full)) {
> + force_flush = 1;
> + addr += PAGE_SIZE;
> + break;
> + }
> + continue;
> + }
> +
> ptent = ptep_get_and_clear_full(mm, addr, pte,
> tlb->fullmm);
> tlb_remove_tlb_entry(tlb, pte, addr);
> --
> 2.25.1


--
Best Regards,
Yan, Zi



2023-07-17 16:26:28

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

On 17/07/2023 16:09, Zi Yan wrote:
> On 17 Jul 2023, at 10:31, Ryan Roberts wrote:
>
>> Like page_remove_rmap() but batch-removes the rmap for a range of pages
>> belonging to a folio. This can provide a small speedup due to less
>> manipuation of the various counters. But more crucially, if removing the
>> rmap for all pages of a folio in a batch, there is no need to
>> (spuriously) add it to the deferred split list, which saves significant
>> cost when there is contention for the split queue lock.
>>
>> All contained pages are accounted using the order-0 folio (or base page)
>> scheme.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> include/linux/rmap.h | 2 ++
>> mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 67 insertions(+)
>>
>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>> index b87d01660412..f578975c12c0 100644
>> --- a/include/linux/rmap.h
>> +++ b/include/linux/rmap.h
>> @@ -200,6 +200,8 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *,
>> bool compound);
>> void page_remove_rmap(struct page *, struct vm_area_struct *,
>> bool compound);
>> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
>> + int nr, struct vm_area_struct *vma);
>>
>> void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
>> unsigned long address, rmap_t flags);
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 2baf57d65c23..1da05aca2bb1 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1359,6 +1359,71 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>> mlock_vma_folio(folio, vma, compound);
>> }
>>
>> +/*
>> + * folio_remove_rmap_range - take down pte mappings from a range of pages
>> + * belonging to a folio. All pages are accounted as small pages.
>> + * @folio: folio that all pages belong to
>> + * @page: first page in range to remove mapping from
>> + * @nr: number of pages in range to remove mapping from
>
> We might need some checks to make sure [page, page+nr] is in the range of
> the folio. Something like:
>
> page >= &folio->page && page + nr < (&folio->page + folio_nr_pages(folio))

No problem. Is a VM_WARN_ON() appropriate for something like this?

>
>> + * @vma: the vm area from which the mapping is removed
>> + *
>> + * The caller needs to hold the pte lock.
>> + */
>> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
>> + int nr, struct vm_area_struct *vma)
>> +{
>> + atomic_t *mapped = &folio->_nr_pages_mapped;
>> + int nr_unmapped = 0;
>> + int nr_mapped;
>> + bool last;
>> + enum node_stat_item idx;
>> +
>> + if (unlikely(folio_test_hugetlb(folio))) {
>> + VM_WARN_ON_FOLIO(1, folio);
>> + return;
>> + }
>> +
>> + if (!folio_test_large(folio)) {
>> + /* Is this the page's last map to be removed? */
>> + last = atomic_add_negative(-1, &page->_mapcount);
>> + nr_unmapped = last;
>> + } else {
>> + for (; nr != 0; nr--, page++) {
>> + /* Is this the page's last map to be removed? */
>> + last = atomic_add_negative(-1, &page->_mapcount);
>> + if (last) {
>> + /* Page still mapped if folio mapped entirely */
>> + nr_mapped = atomic_dec_return_relaxed(mapped);
>> + if (nr_mapped < COMPOUND_MAPPED)
>> + nr_unmapped++;
>> + }
>> + }
>> + }
>> +
>> + if (nr_unmapped) {
>> + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
>> + __lruvec_stat_mod_folio(folio, idx, -nr_unmapped);
>> +
>> + /*
>> + * Queue anon THP for deferred split if we have just unmapped at
>> + * least 1 page, while at least 1 page remains mapped.
>> + */
>> + if (folio_test_large(folio) && folio_test_anon(folio))
>> + if (nr_mapped)
>> + deferred_split_folio(folio);
>> + }
>> +
>> + /*
>> + * It would be tidy to reset folio_test_anon mapping when fully
>> + * unmapped, but that might overwrite a racing page_add_anon_rmap
>> + * which increments mapcount after us but sets mapping before us:
>> + * so leave the reset to free_pages_prepare, and remember that
>> + * it's only reliable while mapped.
>> + */
>> +
>> + munlock_vma_folio(folio, vma, false);
>> +}
>> +
>> /**
>> * page_remove_rmap - take down pte mapping from a page
>> * @page: page to remove mapping from
>> --
>> 2.25.1
>
> Everything else looks good to me. Reviewed-by: Zi Yan <[email protected]>
>
> --
> Best Regards,
> Yan, Zi


2023-07-17 16:27:56

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

On 17/07/2023 16:07, Matthew Wilcox wrote:
> On Mon, Jul 17, 2023 at 03:31:09PM +0100, Ryan Roberts wrote:
>> +/*
>> + * folio_remove_rmap_range - take down pte mappings from a range of pages
>> + * belonging to a folio. All pages are accounted as small pages.
>> + * @folio: folio that all pages belong to
>> + * @page: first page in range to remove mapping from
>> + * @nr: number of pages in range to remove mapping from
>> + * @vma: the vm area from which the mapping is removed
>> + *
>> + * The caller needs to hold the pte lock.
>> + */
>
> This could stand a little reworking. How about this?
>
> /**
> * folio_remove_rmap_range - Take down PTE mappings from a range of pages.
> * @folio: Folio containing all pages in range.
> * @page: First page in range to unmap.
> * @nr: Number of pages to unmap.
> * @vma: The VM area containing the range.
> *
> * All pages in the range must belong to the same VMA & folio. They
> * must be mapped with PTEs, not a PMD.
> *
> * Context: Caller holds the pte lock.
> */

LGTM! thanks.

>
>> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
>> + int nr, struct vm_area_struct *vma)
>> +{
>> + atomic_t *mapped = &folio->_nr_pages_mapped;
>> + int nr_unmapped = 0;
>> + int nr_mapped;
>> + bool last;
>> + enum node_stat_item idx;
>> +
>> + if (unlikely(folio_test_hugetlb(folio))) {
>> + VM_WARN_ON_FOLIO(1, folio);
>> + return;
>> + }
>> +
>> + if (!folio_test_large(folio)) {
>> + /* Is this the page's last map to be removed? */
>> + last = atomic_add_negative(-1, &page->_mapcount);
>> + nr_unmapped = last;
>> + } else {
>> + for (; nr != 0; nr--, page++) {
>> + /* Is this the page's last map to be removed? */
>> + last = atomic_add_negative(-1, &page->_mapcount);
>> + if (last) {
>> + /* Page still mapped if folio mapped entirely */
>> + nr_mapped = atomic_dec_return_relaxed(mapped);
>
> We're still doing one atomic op per page on the folio's nr_pages_mapped
> ... is it possible to batch this and use atomic_sub_return_relaxed()?

Good spot, something like this:

	} else {
		for (; nr != 0; nr--, page++) {
			/* Is this the page's last map to be removed? */
			last = atomic_add_negative(-1, &page->_mapcount);
			if (last)
				nr_unmapped++;
		}

		/* Pages still mapped if folio mapped entirely */
		nr_mapped = atomic_sub_return_relaxed(nr_unmapped, mapped);
		if (nr_mapped >= COMPOUND_MAPPED)
			nr_unmapped = 0;
	}



2023-07-17 16:44:10

by Zi Yan

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

On 17 Jul 2023, at 11:51, Ryan Roberts wrote:

> On 17/07/2023 16:09, Zi Yan wrote:
>> On 17 Jul 2023, at 10:31, Ryan Roberts wrote:
>>
>>> Like page_remove_rmap() but batch-removes the rmap for a range of pages
>>> belonging to a folio. This can provide a small speedup due to less
>>> manipuation of the various counters. But more crucially, if removing the
>>> rmap for all pages of a folio in a batch, there is no need to
>>> (spuriously) add it to the deferred split list, which saves significant
>>> cost when there is contention for the split queue lock.
>>>
>>> All contained pages are accounted using the order-0 folio (or base page)
>>> scheme.
>>>
>>> Signed-off-by: Ryan Roberts <[email protected]>
>>> ---
>>> include/linux/rmap.h | 2 ++
>>> mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
>>> 2 files changed, 67 insertions(+)
>>>
>>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>>> index b87d01660412..f578975c12c0 100644
>>> --- a/include/linux/rmap.h
>>> +++ b/include/linux/rmap.h
>>> @@ -200,6 +200,8 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *,
>>> bool compound);
>>> void page_remove_rmap(struct page *, struct vm_area_struct *,
>>> bool compound);
>>> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
>>> + int nr, struct vm_area_struct *vma);
>>>
>>> void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
>>> unsigned long address, rmap_t flags);
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 2baf57d65c23..1da05aca2bb1 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1359,6 +1359,71 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>> mlock_vma_folio(folio, vma, compound);
>>> }
>>>
>>> +/*
>>> + * folio_remove_rmap_range - take down pte mappings from a range of pages
>>> + * belonging to a folio. All pages are accounted as small pages.
>>> + * @folio: folio that all pages belong to
>>> + * @page: first page in range to remove mapping from
>>> + * @nr: number of pages in range to remove mapping from
>>
>> We might need some checks to make sure [page, page+nr] is in the range of
>> the folio. Something like:
>>
>> page >= &folio->page && page + nr < (&folio->page + folio_nr_pages(folio))
>
> No problem. Is a VM_WARN_ON() appropriate for something like this?

VM_WARN_ON_ONCE() might be better.
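
For illustration only, a sketch of how that might look at the top of
folio_remove_rmap_range() (the exact form is an assumption; it warns when
[page, page + nr) is not fully contained in the folio):

	VM_WARN_ON_ONCE(page < &folio->page ||
			page + nr > &folio->page + folio_nr_pages(folio));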

--
Best Regards,
Yan, Zi



2023-07-17 16:44:13

by Zi Yan

[permalink] [raw]
Subject: Re: [PATCH v1 3/3] mm: Batch-zap large anonymous folio PTE mappings

On 17 Jul 2023, at 11:55, Ryan Roberts wrote:

> On 17/07/2023 16:25, Zi Yan wrote:
>> On 17 Jul 2023, at 10:31, Ryan Roberts wrote:
>>
>>> This allows batching the rmap removal with folio_remove_rmap_range(),
>>> which means we avoid spuriously adding a partially unmapped folio to the
>>> deferrred split queue in the common case, which reduces split queue lock
>>> contention.
>>>
>>> Previously each page was removed from the rmap individually with
>>> page_remove_rmap(). If the first page belonged to a large folio, this
>>> would cause page_remove_rmap() to conclude that the folio was now
>>> partially mapped and add the folio to the deferred split queue. But
>>> subsequent calls would cause the folio to become fully unmapped, meaning
>>> there is no value to adding it to the split queue.
>>>
>>> Signed-off-by: Ryan Roberts <[email protected]>
>>> ---
>>> mm/memory.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>> 1 file changed, 119 insertions(+)
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 01f39e8144ef..6facb8c8807a 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -1391,6 +1391,95 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>>> pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
>>> }
>>>
>>> +static inline unsigned long page_addr(struct page *page,
>>> + struct page *anchor, unsigned long anchor_addr)
>>> +{
>>> + unsigned long offset;
>>> + unsigned long addr;
>>> +
>>> + offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
>>> + addr = anchor_addr + offset;
>>> +
>>> + if (anchor > page) {
>>> + if (addr > anchor_addr)
>>> + return 0;
>>> + } else {
>>> + if (addr < anchor_addr)
>>> + return ULONG_MAX;
>>> + }
>>> +
>>> + return addr;
>>> +}
>>> +
>>> +static int calc_anon_folio_map_pgcount(struct folio *folio,
>>> + struct page *page, pte_t *pte,
>>> + unsigned long addr, unsigned long end)
>>> +{
>>> + pte_t ptent;
>>> + int floops;
>>> + int i;
>>> + unsigned long pfn;
>>> +
>>> + end = min(page_addr(&folio->page + folio_nr_pages(folio), page, addr),
>>> + end);
>>> + floops = (end - addr) >> PAGE_SHIFT;
>>> + pfn = page_to_pfn(page);
>>> + pfn++;
>>> + pte++;
>>> +
>>> + for (i = 1; i < floops; i++) {
>>> + ptent = ptep_get(pte);
>>> +
>>> + if (!pte_present(ptent) ||
>>> + pte_pfn(ptent) != pfn) {
>>> + return i;
>>> + }
>>> +
>>> + pfn++;
>>> + pte++;
>>> + }
>>> +
>>> + return floops;
>>> +}
>>> +
>>> +static unsigned long zap_anon_pte_range(struct mmu_gather *tlb,
>>> + struct vm_area_struct *vma,
>>> + struct page *page, pte_t *pte,
>>> + unsigned long addr, unsigned long end,
>>> + bool *full_out)
>>> +{
>>> + struct folio *folio = page_folio(page);
>>> + struct mm_struct *mm = tlb->mm;
>>> + pte_t ptent;
>>> + int pgcount;
>>> + int i;
>>> + bool full;
>>> +
>>> + pgcount = calc_anon_folio_map_pgcount(folio, page, pte, addr, end);
>>> +
>>> + for (i = 0; i < pgcount;) {
>>> + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
>>> + tlb_remove_tlb_entry(tlb, pte, addr);
>>> + full = __tlb_remove_page(tlb, page, 0);
>>> +
>>> + if (unlikely(page_mapcount(page) < 1))
>>> + print_bad_pte(vma, addr, ptent, page);
>>> +
>>> + i++;
>>> + page++;
>>> + pte++;
>>> + addr += PAGE_SIZE;
>>> +
>>> + if (unlikely(full))
>>> + break;
>>> + }
>>> +
>>> + folio_remove_rmap_range(folio, page - i, i, vma);
>>> +
>>> + *full_out = full;
>>> + return i;
>>> +}
>>> +
>>> static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>> struct vm_area_struct *vma, pmd_t *pmd,
>>> unsigned long addr, unsigned long end,
>>> @@ -1428,6 +1517,36 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>> page = vm_normal_page(vma, addr, ptent);
>>> if (unlikely(!should_zap_page(details, page)))
>>> continue;
>>> +
>>> + /*
>>> + * Batch zap large anonymous folio mappings. This allows
>>> + * batching the rmap removal, which means we avoid
>>> + * spuriously adding a partially unmapped folio to the
>>> + * deferrred split queue in the common case, which
>>> + * reduces split queue lock contention. Require the VMA
>>> + * to be anonymous to ensure that none of the PTEs in
>>> + * the range require zap_install_uffd_wp_if_needed().
>>> + */
>>> + if (page && PageAnon(page) && vma_is_anonymous(vma)) {
>>> + bool full;
>>> + int pgcount;
>>> +
>>> + pgcount = zap_anon_pte_range(tlb, vma,
>>> + page, pte, addr, end, &full);
>>
>> Are you trying to zap as many ptes as possible if all these ptes are
>> within a folio?
>
> Yes.
>
>> If so, why not calculate end before calling zap_anon_pte_range()?
>> That would make zap_anon_pte_range() simpler.
>
> I'm not sure I follow. That's currently done in calc_anon_folio_map_pgcount(). I
> could move it to here, but I'm not sure that makes things simpler, just puts
> more code in here and less in there?

Otherwise your zap_anon_pte_range() is really zap_anon_pte_in_folio_range(), or
needs some other more descriptive name. When I first looked at the name, I
thought the PTEs would be zapped all the way to end, but that is not the case
once I read the code. Future users could easily be confused in the same way and
use it incorrectly.

BTW, page_addr() needs a better name and is easily confused with existing
page_address().

>
>> Also check if page is part of
>> a large folio first to make sure you can batch.
>
> Yeah that's fair. I'd be inclined to put that in zap_anon_pte_range() to short
> circuit calc_anon_folio_map_pgcount(). But ultimately zap_anon_pte_range() would
> still zap the single pte.
>
>
>>
>>> +
>>> + rss[mm_counter(page)] -= pgcount;
>>> + pgcount--;
>>> + pte += pgcount;
>>> + addr += pgcount << PAGE_SHIFT;
>>> +
>>> + if (unlikely(full)) {
>>> + force_flush = 1;
>>> + addr += PAGE_SIZE;
>>> + break;
>>> + }
>>> + continue;
>>> + }
>>> +
>>> ptent = ptep_get_and_clear_full(mm, addr, pte,
>>> tlb->fullmm);
>>> tlb_remove_tlb_entry(tlb, pte, addr);
>>> --
>>> 2.25.1
>>
>>
>> --
>> Best Regards,
>> Yan, Zi


--
Best Regards,
Yan, Zi



2023-07-17 16:52:18

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 3/3] mm: Batch-zap large anonymous folio PTE mappings

On 17/07/2023 16:25, Zi Yan wrote:
> On 17 Jul 2023, at 10:31, Ryan Roberts wrote:
>
>> This allows batching the rmap removal with folio_remove_rmap_range(),
>> which means we avoid spuriously adding a partially unmapped folio to the
>> deferrred split queue in the common case, which reduces split queue lock
>> contention.
>>
>> Previously each page was removed from the rmap individually with
>> page_remove_rmap(). If the first page belonged to a large folio, this
>> would cause page_remove_rmap() to conclude that the folio was now
>> partially mapped and add the folio to the deferred split queue. But
>> subsequent calls would cause the folio to become fully unmapped, meaning
>> there is no value to adding it to the split queue.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> mm/memory.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 119 insertions(+)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 01f39e8144ef..6facb8c8807a 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1391,6 +1391,95 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>> pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
>> }
>>
>> +static inline unsigned long page_addr(struct page *page,
>> + struct page *anchor, unsigned long anchor_addr)
>> +{
>> + unsigned long offset;
>> + unsigned long addr;
>> +
>> + offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
>> + addr = anchor_addr + offset;
>> +
>> + if (anchor > page) {
>> + if (addr > anchor_addr)
>> + return 0;
>> + } else {
>> + if (addr < anchor_addr)
>> + return ULONG_MAX;
>> + }
>> +
>> + return addr;
>> +}
>> +
>> +static int calc_anon_folio_map_pgcount(struct folio *folio,
>> + struct page *page, pte_t *pte,
>> + unsigned long addr, unsigned long end)
>> +{
>> + pte_t ptent;
>> + int floops;
>> + int i;
>> + unsigned long pfn;
>> +
>> + end = min(page_addr(&folio->page + folio_nr_pages(folio), page, addr),
>> + end);
>> + floops = (end - addr) >> PAGE_SHIFT;
>> + pfn = page_to_pfn(page);
>> + pfn++;
>> + pte++;
>> +
>> + for (i = 1; i < floops; i++) {
>> + ptent = ptep_get(pte);
>> +
>> + if (!pte_present(ptent) ||
>> + pte_pfn(ptent) != pfn) {
>> + return i;
>> + }
>> +
>> + pfn++;
>> + pte++;
>> + }
>> +
>> + return floops;
>> +}
>> +
>> +static unsigned long zap_anon_pte_range(struct mmu_gather *tlb,
>> + struct vm_area_struct *vma,
>> + struct page *page, pte_t *pte,
>> + unsigned long addr, unsigned long end,
>> + bool *full_out)
>> +{
>> + struct folio *folio = page_folio(page);
>> + struct mm_struct *mm = tlb->mm;
>> + pte_t ptent;
>> + int pgcount;
>> + int i;
>> + bool full;
>> +
>> + pgcount = calc_anon_folio_map_pgcount(folio, page, pte, addr, end);
>> +
>> + for (i = 0; i < pgcount;) {
>> + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
>> + tlb_remove_tlb_entry(tlb, pte, addr);
>> + full = __tlb_remove_page(tlb, page, 0);
>> +
>> + if (unlikely(page_mapcount(page) < 1))
>> + print_bad_pte(vma, addr, ptent, page);
>> +
>> + i++;
>> + page++;
>> + pte++;
>> + addr += PAGE_SIZE;
>> +
>> + if (unlikely(full))
>> + break;
>> + }
>> +
>> + folio_remove_rmap_range(folio, page - i, i, vma);
>> +
>> + *full_out = full;
>> + return i;
>> +}
>> +
>> static unsigned long zap_pte_range(struct mmu_gather *tlb,
>> struct vm_area_struct *vma, pmd_t *pmd,
>> unsigned long addr, unsigned long end,
>> @@ -1428,6 +1517,36 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>> page = vm_normal_page(vma, addr, ptent);
>> if (unlikely(!should_zap_page(details, page)))
>> continue;
>> +
>> + /*
>> + * Batch zap large anonymous folio mappings. This allows
>> + * batching the rmap removal, which means we avoid
>> + * spuriously adding a partially unmapped folio to the
>> + * deferrred split queue in the common case, which
>> + * reduces split queue lock contention. Require the VMA
>> + * to be anonymous to ensure that none of the PTEs in
>> + * the range require zap_install_uffd_wp_if_needed().
>> + */
>> + if (page && PageAnon(page) && vma_is_anonymous(vma)) {
>> + bool full;
>> + int pgcount;
>> +
>> + pgcount = zap_anon_pte_range(tlb, vma,
>> + page, pte, addr, end, &full);
>
> Are you trying to zap as many ptes as possible if all these ptes are
> within a folio?

Yes.

> If so, why not calculate end before calling zap_anon_pte_range()?
> That would make zap_anon_pte_range() simpler.

I'm not sure I follow. That's currently done in calc_anon_folio_map_pgcount(). I
could move it to here, but I'm not sure that makes things simpler, just puts
more code in here and less in there?

> Also check if page is part of
> a large folio first to make sure you can batch.

Yeah that's fair. I'd be inclined to put that in zap_anon_pte_range() to short
circuit calc_anon_folio_map_pgcount(). But ultimately zap_anon_pte_range() would
still zap the single pte.


>
>> +
>> + rss[mm_counter(page)] -= pgcount;
>> + pgcount--;
>> + pte += pgcount;
>> + addr += pgcount << PAGE_SHIFT;
>> +
>> + if (unlikely(full)) {
>> + force_flush = 1;
>> + addr += PAGE_SIZE;
>> + break;
>> + }
>> + continue;
>> + }
>> +
>> ptent = ptep_get_and_clear_full(mm, addr, pte,
>> tlb->fullmm);
>> tlb_remove_tlb_entry(tlb, pte, addr);
>> --
>> 2.25.1
>
>
> --
> Best Regards,
> Yan, Zi


2023-07-17 16:54:58

by Matthew Wilcox

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On Mon, Jul 17, 2023 at 04:54:58PM +0100, Matthew Wilcox wrote:
> Those are page->flags and page->compound_head for the third page in
> the folio. They don't really need a name; nothing refers to them,
> but it's important that space not be reused ;-)
>
> This is slightly different from _flags_1; we do have some flags which
> reuse the bits (they're labelled as PF_SECOND). Right now, it's only
> PG_has_hwpoisoned, but we used to have PG_double_map. Others may arise.

Sorry, this was incomplete. We do still have per-page flags! HWPoison
is the obvious one, but PG_head is per-page (... think about it ...), and
PG_anon_exclusive is actually per-page.

Most of the flags labelled as PF_ANY are mislabelled. PG_private and
PG_private2 are never set/cleared/tested on tail pages. PG_young and
PG_idle are only ever tested on the head page, but some code incorrectly
sets them on tail pages, where those bits are ignored. I tried to fix
that a while ago, but the patch was overlooked and I couldn't be bothered
to try all that hard. I have no clue about PG_vmemmap_self_hosted.
I think PG_isolated is probably never set on compound pages.
PG_owner_priv_1 is a disaster, as you might expect.

2023-07-17 16:56:42

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On 17.07.23 18:01, Ryan Roberts wrote:
> On 17/07/2023 16:42, David Hildenbrand wrote:
>> On 17.07.23 16:31, Ryan Roberts wrote:
>>> In preparation for the introduction of large folios for anonymous
>>> memory, we would like to be able to split them when they have unmapped
>>> subpages, in order to free those unused pages under memory pressure. So
>>> remove the artificial requirement that the large folio needed to be at
>>> least PMD-sized.
>>>
>>> Signed-off-by: Ryan Roberts <[email protected]>
>>> Reviewed-by: Yu Zhao <[email protected]>
>>> Reviewed-by: Yin Fengwei <[email protected]>
>>> ---
>>>   mm/rmap.c | 2 +-
>>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 0c0d8857dfce..2baf57d65c23 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1430,7 +1430,7 @@ void page_remove_rmap(struct page *page, struct
>>> vm_area_struct *vma,
>>>            * page of the folio is unmapped and at least one page
>>>            * is still mapped.
>>>            */
>>> -        if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
>>> +        if (folio_test_large(folio) && folio_test_anon(folio))
>>>               if (!compound || nr < nr_pmdmapped)
>>>                   deferred_split_folio(folio);
>>
>> !compound will always be true I guess, so nr_pmdmapped == 0 (which will always
>> be the case) will be ignored.
>
> I don't follow why !compound will always be true. This function is
> page_remove_rmap() (not folio_remove_rmap_range() which I add in a later patch).
> page_remove_rmap() can work on pmd-mapped pages where compound=true is passed in.

I was talking about the folio_test_pmd_mappable() -> folio_test_large()
change. For folio_test_large() && !folio_test_pmd_mappable() I expect
that we'll never pass in "compound=true".

--
Cheers,

David / dhildenb


2023-07-17 17:25:24

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On 17.07.23 17:54, Matthew Wilcox wrote:
> On Mon, Jul 17, 2023 at 05:43:40PM +0200, David Hildenbrand wrote:
>> On 17.07.23 17:41, Ryan Roberts wrote:
>>> On 17/07/2023 16:30, Matthew Wilcox wrote:
>>>> On Mon, Jul 17, 2023 at 03:31:08PM +0100, Ryan Roberts wrote:
>>>>> In preparation for the introduction of large folios for anonymous
>>>>> memory, we would like to be able to split them when they have unmapped
>>>>> subpages, in order to free those unused pages under memory pressure. So
>>>>> remove the artificial requirement that the large folio needed to be at
>>>>> least PMD-sized.
>>>>>
>>>>> Signed-off-by: Ryan Roberts <[email protected]>
>>>>> Reviewed-by: Yu Zhao <[email protected]>
>>>>> Reviewed-by: Yin Fengwei <[email protected]>
>>>>
>>>> Reviewed-by: Matthew Wilcox (Oracle) <[email protected]>
>>>
>>> Thanks!
>>>
>>>>
>>>>> */
>>>>> - if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
>>>>> + if (folio_test_large(folio) && folio_test_anon(folio))
>>>>> if (!compound || nr < nr_pmdmapped)
>>>>> deferred_split_folio(folio);
>>>>
>>>> I wonder if it's worth introducing a folio_test_deferred_split() (better
>>>> naming appreciated ...) to allow us to allocate order-1 folios and not
>>>> do horrible things. Maybe it's not worth supporting order-1 folios;
>>>> we're always better off going to order-2 immediately. Just thinking.
>>>
>>> There is more than just _deferred_list in the 3rd page; you also have _flags_2a
>>> and _head_2a. I guess you know much better than me what they store. But I'm
>>> guessing it's harder than just not splitting an order-1 page?
>
> Those are page->flags and page->compound_head for the third page in
> the folio. They don't really need a name; nothing refers to them,
> but it's important that space not be reused ;-)
>
> This is slightly different from _flags_1; we do have some flags which
> reuse the bits (they're labelled as PF_SECOND). Right now, it's only
> PG_has_hwpoisoned, but we used to have PG_double_map. Others may arise.
>
>>> With the direction of large anon folios (_not_ retrying with every order down to
>>> 0), I'm not sure what the use case would be for order-1 anyway?
>>
>> Just noting that we might need some struct-page space for better
>> mapcount/shared tracking, which might get hard for order-1 pages.
>
> My assumption had been that we'd be able to reuse the _entire_mapcount
> and _nr_pages_mapped fields and not spill into the third page, but the

We most likely have to keep _entire_mapcount to keep "PMD mapped"
working (I don't think we can not account that, some user space relies
on that). Reusing _nr_pages_mapped for _total_mapcount would work until
we need more bits.

But once we want to sort out some other questions like "is this folio
mapped shared or mapped exclusive" we might need more space.

What I am playing with right now to tackle that would most probably not
fit in there (but I'll keep trying ;) ).

> third page is definitely available today if we want it. I'm fine with
> disallowing order-1 anon/file folios forever.

Yes, let's first sort out the open issues before going down that path
(might not really be worth it after all).

--
Cheers,

David / dhildenb


2023-07-18 00:18:01

by Yin, Fengwei

[permalink] [raw]
Subject: Re: [PATCH v1 3/3] mm: Batch-zap large anonymous folio PTE mappings



On 7/17/23 22:31, Ryan Roberts wrote:
> This allows batching the rmap removal with folio_remove_rmap_range(),
> which means we avoid spuriously adding a partially unmapped folio to the
> deferred split queue in the common case, which reduces split queue lock
> contention.
>
> Previously each page was removed from the rmap individually with
> page_remove_rmap(). If the first page belonged to a large folio, this
> would cause page_remove_rmap() to conclude that the folio was now
> partially mapped and add the folio to the deferred split queue. But
> subsequent calls would cause the folio to become fully unmapped, meaning
> there is no value to adding it to the split queue.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> mm/memory.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 119 insertions(+)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 01f39e8144ef..6facb8c8807a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1391,6 +1391,95 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
> pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
> }
>
> +static inline unsigned long page_addr(struct page *page,
> + struct page *anchor, unsigned long anchor_addr)
> +{
> + unsigned long offset;
> + unsigned long addr;
> +
> + offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
> + addr = anchor_addr + offset;
> +
> + if (anchor > page) {
> + if (addr > anchor_addr)
> + return 0;
> + } else {
> + if (addr < anchor_addr)
> + return ULONG_MAX;
> + }
> +
> + return addr;
> +}
> +
> +static int calc_anon_folio_map_pgcount(struct folio *folio,
> + struct page *page, pte_t *pte,
> + unsigned long addr, unsigned long end)
> +{
> + pte_t ptent;
> + int floops;
> + int i;
> + unsigned long pfn;
> +
> + end = min(page_addr(&folio->page + folio_nr_pages(folio), page, addr),
> + end);
> + floops = (end - addr) >> PAGE_SHIFT;
> + pfn = page_to_pfn(page);
> + pfn++;
> + pte++;
> +
> + for (i = 1; i < floops; i++) {
> + ptent = ptep_get(pte);
> +
> + if (!pte_present(ptent) ||
> + pte_pfn(ptent) != pfn) {
> + return i;
> + }
> +
> + pfn++;
> + pte++;
> + }
> +
> + return floops;
> +}
> +
> +static unsigned long zap_anon_pte_range(struct mmu_gather *tlb,
> + struct vm_area_struct *vma,
> + struct page *page, pte_t *pte,
> + unsigned long addr, unsigned long end,
> + bool *full_out)
> +{
> + struct folio *folio = page_folio(page);
> + struct mm_struct *mm = tlb->mm;
> + pte_t ptent;
> + int pgcount;
> + int i;
> + bool full;
> +
> + pgcount = calc_anon_folio_map_pgcount(folio, page, pte, addr, end);
> +
> + for (i = 0; i < pgcount;) {
> + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
> + tlb_remove_tlb_entry(tlb, pte, addr);
> + full = __tlb_remove_page(tlb, page, 0);
> +
> + if (unlikely(page_mapcount(page) < 1))
> + print_bad_pte(vma, addr, ptent, page);
> +
> + i++;
> + page++;
> + pte++;
> + addr += PAGE_SIZE;
> +
> + if (unlikely(full))
> + break;
> + }
> +
> + folio_remove_rmap_range(folio, page - i, i, vma);
> +
> + *full_out = full;
> + return i;
> +}
> +
> static unsigned long zap_pte_range(struct mmu_gather *tlb,
> struct vm_area_struct *vma, pmd_t *pmd,
> unsigned long addr, unsigned long end,
> @@ -1428,6 +1517,36 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
> page = vm_normal_page(vma, addr, ptent);
> if (unlikely(!should_zap_page(details, page)))
> continue;
> +
> + /*
> + * Batch zap large anonymous folio mappings. This allows
> + * batching the rmap removal, which means we avoid
> + * spuriously adding a partially unmapped folio to the
> + * deferred split queue in the common case, which
> + * reduces split queue lock contention. Require the VMA
> + * to be anonymous to ensure that none of the PTEs in
> + * the range require zap_install_uffd_wp_if_needed().
> + */
> + if (page && PageAnon(page) && vma_is_anonymous(vma)) {
Why is this only for anonymous pages? I suppose it could support file mappings also.


Regards
Yin, Fengwei

> + bool full;
> + int pgcount;
> +
> + pgcount = zap_anon_pte_range(tlb, vma,
> + page, pte, addr, end, &full);
> +
> + rss[mm_counter(page)] -= pgcount;
> + pgcount--;
> + pte += pgcount;
> + addr += pgcount << PAGE_SHIFT;
> +
> + if (unlikely(full)) {
> + force_flush = 1;
> + addr += PAGE_SIZE;
> + break;
> + }
> + continue;
> + }
> +
> ptent = ptep_get_and_clear_full(mm, addr, pte,
> tlb->fullmm);
> tlb_remove_tlb_entry(tlb, pte, addr);

2023-07-18 01:31:47

by Yin, Fengwei

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()



On 7/17/23 22:31, Ryan Roberts wrote:
> Like page_remove_rmap() but batch-removes the rmap for a range of pages
> belonging to a folio. This can provide a small speedup due to less
> manipuation of the various counters. But more crucially, if removing the
> rmap for all pages of a folio in a batch, there is no need to
> (spuriously) add it to the deferred split list, which saves significant
> cost when there is contention for the split queue lock.
>
> All contained pages are accounted using the order-0 folio (or base page)
> scheme.
>
> Signed-off-by: Ryan Roberts <[email protected]>

Reviewed-by: Yin Fengwei <[email protected]>

Regards
Yin, Fengwei

> ---
> include/linux/rmap.h | 2 ++
> mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 67 insertions(+)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index b87d01660412..f578975c12c0 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -200,6 +200,8 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *,
> bool compound);
> void page_remove_rmap(struct page *, struct vm_area_struct *,
> bool compound);
> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
> + int nr, struct vm_area_struct *vma);
>
> void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
> unsigned long address, rmap_t flags);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 2baf57d65c23..1da05aca2bb1 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1359,6 +1359,71 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
> mlock_vma_folio(folio, vma, compound);
> }
>
> +/*
> + * folio_remove_rmap_range - take down pte mappings from a range of pages
> + * belonging to a folio. All pages are accounted as small pages.
> + * @folio: folio that all pages belong to
> + * @page: first page in range to remove mapping from
> + * @nr: number of pages in range to remove mapping from
> + * @vma: the vm area from which the mapping is removed
> + *
> + * The caller needs to hold the pte lock.
> + */
> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
> + int nr, struct vm_area_struct *vma)
> +{
> + atomic_t *mapped = &folio->_nr_pages_mapped;
> + int nr_unmapped = 0;
> + int nr_mapped;
> + bool last;
> + enum node_stat_item idx;
> +
> + if (unlikely(folio_test_hugetlb(folio))) {
> + VM_WARN_ON_FOLIO(1, folio);
> + return;
> + }
> +
> + if (!folio_test_large(folio)) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + nr_unmapped = last;
> + } else {
> + for (; nr != 0; nr--, page++) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + if (last) {
> + /* Page still mapped if folio mapped entirely */
> + nr_mapped = atomic_dec_return_relaxed(mapped);
> + if (nr_mapped < COMPOUND_MAPPED)
> + nr_unmapped++;
> + }
> + }
> + }
> +
> + if (nr_unmapped) {
> + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
> + __lruvec_stat_mod_folio(folio, idx, -nr_unmapped);
> +
> + /*
> + * Queue anon THP for deferred split if we have just unmapped at
> + * least 1 page, while at least 1 page remains mapped.
> + */
> + if (folio_test_large(folio) && folio_test_anon(folio))
> + if (nr_mapped)
> + deferred_split_folio(folio);
> + }
> +
> + /*
> + * It would be tidy to reset folio_test_anon mapping when fully
> + * unmapped, but that might overwrite a racing page_add_anon_rmap
> + * which increments mapcount after us but sets mapping before us:
> + * so leave the reset to free_pages_prepare, and remember that
> + * it's only reliable while mapped.
> + */
> +
> + munlock_vma_folio(folio, vma, false);
> +}
> +
> /**
> * page_remove_rmap - take down pte mapping from a page
> * @page: page to remove mapping from

2023-07-18 06:40:06

by Huang, Ying

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

Ryan Roberts <[email protected]> writes:

> Like page_remove_rmap() but batch-removes the rmap for a range of pages
> belonging to a folio. This can provide a small speedup due to less
> manipuation of the various counters. But more crucially, if removing the
> rmap for all pages of a folio in a batch, there is no need to
> (spuriously) add it to the deferred split list, which saves significant
> cost when there is contention for the split queue lock.
>
> All contained pages are accounted using the order-0 folio (or base page)
> scheme.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> include/linux/rmap.h | 2 ++
> mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 67 insertions(+)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index b87d01660412..f578975c12c0 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -200,6 +200,8 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *,
> bool compound);
> void page_remove_rmap(struct page *, struct vm_area_struct *,
> bool compound);
> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
> + int nr, struct vm_area_struct *vma);
>
> void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
> unsigned long address, rmap_t flags);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 2baf57d65c23..1da05aca2bb1 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1359,6 +1359,71 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
> mlock_vma_folio(folio, vma, compound);
> }
>
> +/*
> + * folio_remove_rmap_range - take down pte mappings from a range of pages
> + * belonging to a folio. All pages are accounted as small pages.
> + * @folio: folio that all pages belong to
> + * @page: first page in range to remove mapping from
> + * @nr: number of pages in range to remove mapping from
> + * @vma: the vm area from which the mapping is removed
> + *
> + * The caller needs to hold the pte lock.
> + */
> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
> + int nr, struct vm_area_struct *vma)
> +{
> + atomic_t *mapped = &folio->_nr_pages_mapped;
> + int nr_unmapped = 0;
> + int nr_mapped;
> + bool last;
> + enum node_stat_item idx;
> +
> + if (unlikely(folio_test_hugetlb(folio))) {
> + VM_WARN_ON_FOLIO(1, folio);
> + return;
> + }
> +
> + if (!folio_test_large(folio)) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + nr_unmapped = last;
> + } else {
> + for (; nr != 0; nr--, page++) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + if (last) {
> + /* Page still mapped if folio mapped entirely */
> + nr_mapped = atomic_dec_return_relaxed(mapped);
> + if (nr_mapped < COMPOUND_MAPPED)
> + nr_unmapped++;
> + }
> + }
> + }
> +
> + if (nr_unmapped) {
> + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
> + __lruvec_stat_mod_folio(folio, idx, -nr_unmapped);
> +
> + /*
> + * Queue anon THP for deferred split if we have just unmapped at

Just some nitpicks. So feel free to ignore.

s/anon THP/large folio/ ?

> + * least 1 page, while at least 1 page remains mapped.
> + */
> + if (folio_test_large(folio) && folio_test_anon(folio))
> + if (nr_mapped)

if (folio_test_large(folio) && folio_test_anon(folio) && nr_mapped) ?

> + deferred_split_folio(folio);
> + }
> +
> + /*
> + * It would be tidy to reset folio_test_anon mapping when fully
> + * unmapped, but that might overwrite a racing page_add_anon_rmap
> + * which increments mapcount after us but sets mapping before us:
> + * so leave the reset to free_pages_prepare, and remember that
> + * it's only reliable while mapped.
> + */
> +
> + munlock_vma_folio(folio, vma, false);
> +}
> +
> /**
> * page_remove_rmap - take down pte mapping from a page
> * @page: page to remove mapping from

Best Regards,
Huang, Ying

2023-07-18 07:41:08

by Huang, Ying

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

Ryan Roberts <[email protected]> writes:

> Like page_remove_rmap() but batch-removes the rmap for a range of pages
> belonging to a folio. This can provide a small speedup due to less
> manipuation of the various counters. But more crucially, if removing the
> rmap for all pages of a folio in a batch, there is no need to
> (spuriously) add it to the deferred split list, which saves significant
> cost when there is contention for the split queue lock.
>
> All contained pages are accounted using the order-0 folio (or base page)
> scheme.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> include/linux/rmap.h | 2 ++
> mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 67 insertions(+)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index b87d01660412..f578975c12c0 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -200,6 +200,8 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *,
> bool compound);
> void page_remove_rmap(struct page *, struct vm_area_struct *,
> bool compound);
> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
> + int nr, struct vm_area_struct *vma);
>
> void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
> unsigned long address, rmap_t flags);
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 2baf57d65c23..1da05aca2bb1 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1359,6 +1359,71 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
> mlock_vma_folio(folio, vma, compound);
> }
>
> +/*
> + * folio_remove_rmap_range - take down pte mappings from a range of pages
> + * belonging to a folio. All pages are accounted as small pages.
> + * @folio: folio that all pages belong to
> + * @page: first page in range to remove mapping from
> + * @nr: number of pages in range to remove mapping from
> + * @vma: the vm area from which the mapping is removed
> + *
> + * The caller needs to hold the pte lock.
> + */
> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
> + int nr, struct vm_area_struct *vma)

> Can we call folio_remove_rmap_range() in page_remove_rmap() if
!compound? This can give us some opportunities to reduce code
duplication?

Best Regards,
Huang, Ying

> +{
> + atomic_t *mapped = &folio->_nr_pages_mapped;
> + int nr_unmapped = 0;
> + int nr_mapped;
> + bool last;
> + enum node_stat_item idx;
> +
> + if (unlikely(folio_test_hugetlb(folio))) {
> + VM_WARN_ON_FOLIO(1, folio);
> + return;
> + }
> +
> + if (!folio_test_large(folio)) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + nr_unmapped = last;
> + } else {
> + for (; nr != 0; nr--, page++) {
> + /* Is this the page's last map to be removed? */
> + last = atomic_add_negative(-1, &page->_mapcount);
> + if (last) {
> + /* Page still mapped if folio mapped entirely */
> + nr_mapped = atomic_dec_return_relaxed(mapped);
> + if (nr_mapped < COMPOUND_MAPPED)
> + nr_unmapped++;
> + }
> + }
> + }
> +
> + if (nr_unmapped) {
> + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
> + __lruvec_stat_mod_folio(folio, idx, -nr_unmapped);
> +
> + /*
> + * Queue anon THP for deferred split if we have just unmapped at
> + * least 1 page, while at least 1 page remains mapped.
> + */
> + if (folio_test_large(folio) && folio_test_anon(folio))
> + if (nr_mapped)
> + deferred_split_folio(folio);
> + }
> +
> + /*
> + * It would be tidy to reset folio_test_anon mapping when fully
> + * unmapped, but that might overwrite a racing page_add_anon_rmap
> + * which increments mapcount after us but sets mapping before us:
> + * so leave the reset to free_pages_prepare, and remember that
> + * it's only reliable while mapped.
> + */
> +
> + munlock_vma_folio(folio, vma, false);
> +}
> +
> /**
> * page_remove_rmap - take down pte mapping from a page
> * @page: page to remove mapping from

2023-07-18 09:19:53

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On 18.07.23 10:58, Ryan Roberts wrote:
> On 17/07/2023 17:48, David Hildenbrand wrote:
>> On 17.07.23 18:01, Ryan Roberts wrote:
>>> On 17/07/2023 16:42, David Hildenbrand wrote:
>>>> On 17.07.23 16:31, Ryan Roberts wrote:
>>>>> In preparation for the introduction of large folios for anonymous
>>>>> memory, we would like to be able to split them when they have unmapped
>>>>> subpages, in order to free those unused pages under memory pressure. So
>>>>> remove the artificial requirement that the large folio needed to be at
>>>>> least PMD-sized.
>>>>>
>>>>> Signed-off-by: Ryan Roberts <[email protected]>
>>>>> Reviewed-by: Yu Zhao <[email protected]>
>>>>> Reviewed-by: Yin Fengwei <[email protected]>
>>>>> ---
>>>>>    mm/rmap.c | 2 +-
>>>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>>> index 0c0d8857dfce..2baf57d65c23 100644
>>>>> --- a/mm/rmap.c
>>>>> +++ b/mm/rmap.c
>>>>> @@ -1430,7 +1430,7 @@ void page_remove_rmap(struct page *page, struct
>>>>> vm_area_struct *vma,
>>>>>             * page of the folio is unmapped and at least one page
>>>>>             * is still mapped.
>>>>>             */
>>>>> -        if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
>>>>> +        if (folio_test_large(folio) && folio_test_anon(folio))
>>>>>                if (!compound || nr < nr_pmdmapped)
>>>>>                    deferred_split_folio(folio);
>>>>
>>>> !compound will always be true I guess, so nr_pmdmapped == 0 (which will always
>>>> be the case) will be ignored.
>>>
>>> I don't follow why !compound will always be true. This function is
>>> page_remove_rmap() (not folio_remove_rmap_range() which I add in a later patch).
>>> page_remove_rmap() can work on pmd-mapped pages where compound=true is passed in.
>>
>> I was talking about the folio_test_pmd_mappable() -> folio_test_large() change.
>> For folio_test_large() && !folio_test_pmd_mappable() I expect that we'll never
>> pass in "compound=true".
>>
>
> Sorry David, I've been staring at the code and your comment, and I still don't
> understand your point. I assumed you were trying to say that compound is always
> false and therefore "if (!compound || nr < nr_pmdmapped)" can be removed? But
> it's not the case that compound is always false; it will be true when called to
> remove a pmd-mapped compound page.

Let me try again:

Assume, as I wrote, that we are given a folio that is
"folio_test_large() && !folio_test_pmd_mappable()". That is, a folio
that is *not* pmd mappable.

If it's not pmd-mappable, certainly, nr_pmdmapped == 0, and therefore,
"nr < nr_pmdmapped" will never ever trigger.

The only way to have it added to the deferred split queue is, therefore
"if (!compound)".

So *for these folios*, we will always pass "compound == false" to make
that "if (!compound)" succeed.
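
Or, spelling the same thing out against the new code (purely restating the
above, not suggesting any change):

        if (folio_test_large(folio) && folio_test_anon(folio))
                /*
                 * For a large folio that is not pmd-mappable, nr_pmdmapped
                 * is always 0, so "nr < nr_pmdmapped" can never be true.
                 * The only way onto the deferred split queue is therefore
                 * !compound, and callers only ever pass compound=false for
                 * such folios anyway.
                 */
                if (!compound || nr < nr_pmdmapped)
                        deferred_split_folio(folio);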


Does that make sense?

> What change are you suggesting, exactly?

Oh, I never suggested a change (I even gave you my RB). I was just
thinking out loud.

--
Cheers,

David / dhildenb


2023-07-18 09:51:42

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On 17/07/2023 17:48, David Hildenbrand wrote:
> On 17.07.23 18:01, Ryan Roberts wrote:
>> On 17/07/2023 16:42, David Hildenbrand wrote:
>>> On 17.07.23 16:31, Ryan Roberts wrote:
>>>> In preparation for the introduction of large folios for anonymous
>>>> memory, we would like to be able to split them when they have unmapped
>>>> subpages, in order to free those unused pages under memory pressure. So
>>>> remove the artificial requirement that the large folio needed to be at
>>>> least PMD-sized.
>>>>
>>>> Signed-off-by: Ryan Roberts <[email protected]>
>>>> Reviewed-by: Yu Zhao <[email protected]>
>>>> Reviewed-by: Yin Fengwei <[email protected]>
>>>> ---
>>>>    mm/rmap.c | 2 +-
>>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>>
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index 0c0d8857dfce..2baf57d65c23 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1430,7 +1430,7 @@ void page_remove_rmap(struct page *page, struct
>>>> vm_area_struct *vma,
>>>>             * page of the folio is unmapped and at least one page
>>>>             * is still mapped.
>>>>             */
>>>> -        if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
>>>> +        if (folio_test_large(folio) && folio_test_anon(folio))
>>>>                if (!compound || nr < nr_pmdmapped)
>>>>                    deferred_split_folio(folio);
>>>
>>> !compound will always be true I guess, so nr_pmdmapped == 0 (which will always
>>> be the case) will be ignored.
>>
>> I don't follow why !compound will always be true. This function is
>> page_remove_rmap() (not folio_remove_rmap_range() which I add in a later patch).
>> page_remove_rmap() can work on pmd-mapped pages where compound=true is passed in.
>
> I was talking about the folio_test_pmd_mappable() -> folio_test_large() change.
> For folio_test_large() && !folio_test_pmd_mappable() I expect that we'll never
> pass in "compound=true".
>

Sorry David, I've been staring at the code and your comment, and I still don't
understand your point. I assumed you were trying to say that compound is always
false and therefore "if (!compound || nr < nr_pmdmapped)" can be removed? But
its not the case that compound is always false; it will be true when called to
remove a pmd-mapped compound page. What change are you suggesting, exactly?

2023-07-18 10:05:47

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

On 18/07/2023 07:22, Huang, Ying wrote:
> Ryan Roberts <[email protected]> writes:
>
>> Like page_remove_rmap() but batch-removes the rmap for a range of pages
>> belonging to a folio. This can provide a small speedup due to less
>> manipuation of the various counters. But more crucially, if removing the
>> rmap for all pages of a folio in a batch, there is no need to
>> (spuriously) add it to the deferred split list, which saves significant
>> cost when there is contention for the split queue lock.
>>
>> All contained pages are accounted using the order-0 folio (or base page)
>> scheme.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> include/linux/rmap.h | 2 ++
>> mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 67 insertions(+)
>>
>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>> index b87d01660412..f578975c12c0 100644
>> --- a/include/linux/rmap.h
>> +++ b/include/linux/rmap.h
>> @@ -200,6 +200,8 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *,
>> bool compound);
>> void page_remove_rmap(struct page *, struct vm_area_struct *,
>> bool compound);
>> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
>> + int nr, struct vm_area_struct *vma);
>>
>> void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
>> unsigned long address, rmap_t flags);
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 2baf57d65c23..1da05aca2bb1 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1359,6 +1359,71 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>> mlock_vma_folio(folio, vma, compound);
>> }
>>
>> +/*
>> + * folio_remove_rmap_range - take down pte mappings from a range of pages
>> + * belonging to a folio. All pages are accounted as small pages.
>> + * @folio: folio that all pages belong to
>> + * @page: first page in range to remove mapping from
>> + * @nr: number of pages in range to remove mapping from
>> + * @vma: the vm area from which the mapping is removed
>> + *
>> + * The caller needs to hold the pte lock.
>> + */
>> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
>> + int nr, struct vm_area_struct *vma)
>> +{
>> + atomic_t *mapped = &folio->_nr_pages_mapped;
>> + int nr_unmapped = 0;
>> + int nr_mapped;
>> + bool last;
>> + enum node_stat_item idx;
>> +
>> + if (unlikely(folio_test_hugetlb(folio))) {
>> + VM_WARN_ON_FOLIO(1, folio);
>> + return;
>> + }
>> +
>> + if (!folio_test_large(folio)) {
>> + /* Is this the page's last map to be removed? */
>> + last = atomic_add_negative(-1, &page->_mapcount);
>> + nr_unmapped = last;
>> + } else {
>> + for (; nr != 0; nr--, page++) {
>> + /* Is this the page's last map to be removed? */
>> + last = atomic_add_negative(-1, &page->_mapcount);
>> + if (last) {
>> + /* Page still mapped if folio mapped entirely */
>> + nr_mapped = atomic_dec_return_relaxed(mapped);
>> + if (nr_mapped < COMPOUND_MAPPED)
>> + nr_unmapped++;
>> + }
>> + }
>> + }
>> +
>> + if (nr_unmapped) {
>> + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
>> + __lruvec_stat_mod_folio(folio, idx, -nr_unmapped);
>> +
>> + /*
>> + * Queue anon THP for deferred split if we have just unmapped at
>
> Just some nitpicks. So feel free to ignore.
>
> s/anon THP/large folio/ ?

ACK

>
>> + * least 1 page, while at least 1 page remains mapped.
>> + */
>> + if (folio_test_large(folio) && folio_test_anon(folio))
>> + if (nr_mapped)
>
> if (folio_test_large(folio) && folio_test_anon(folio) && nr_mapped) ?

ACK: I'll make these changes for the next version.

>
>> + deferred_split_folio(folio);
>> + }
>> +
>> + /*
>> + * It would be tidy to reset folio_test_anon mapping when fully
>> + * unmapped, but that might overwrite a racing page_add_anon_rmap
>> + * which increments mapcount after us but sets mapping before us:
>> + * so leave the reset to free_pages_prepare, and remember that
>> + * it's only reliable while mapped.
>> + */
>> +
>> + munlock_vma_folio(folio, vma, false);
>> +}
>> +
>> /**
>> * page_remove_rmap - take down pte mapping from a page
>> * @page: page to remove mapping from
>
> Best Regards,
> Huang, Ying


2023-07-18 10:16:07

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 1/3] mm: Allow deferred splitting of arbitrary large anon folios

On 18/07/2023 10:08, David Hildenbrand wrote:
> On 18.07.23 10:58, Ryan Roberts wrote:
>> On 17/07/2023 17:48, David Hildenbrand wrote:
>>> On 17.07.23 18:01, Ryan Roberts wrote:
>>>> On 17/07/2023 16:42, David Hildenbrand wrote:
>>>>> On 17.07.23 16:31, Ryan Roberts wrote:
>>>>>> In preparation for the introduction of large folios for anonymous
>>>>>> memory, we would like to be able to split them when they have unmapped
>>>>>> subpages, in order to free those unused pages under memory pressure. So
>>>>>> remove the artificial requirement that the large folio needed to be at
>>>>>> least PMD-sized.
>>>>>>
>>>>>> Signed-off-by: Ryan Roberts <[email protected]>
>>>>>> Reviewed-by: Yu Zhao <[email protected]>
>>>>>> Reviewed-by: Yin Fengwei <[email protected]>
>>>>>> ---
>>>>>>     mm/rmap.c | 2 +-
>>>>>>     1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>>>> index 0c0d8857dfce..2baf57d65c23 100644
>>>>>> --- a/mm/rmap.c
>>>>>> +++ b/mm/rmap.c
>>>>>> @@ -1430,7 +1430,7 @@ void page_remove_rmap(struct page *page, struct
>>>>>> vm_area_struct *vma,
>>>>>>              * page of the folio is unmapped and at least one page
>>>>>>              * is still mapped.
>>>>>>              */
>>>>>> -        if (folio_test_pmd_mappable(folio) && folio_test_anon(folio))
>>>>>> +        if (folio_test_large(folio) && folio_test_anon(folio))
>>>>>>                 if (!compound || nr < nr_pmdmapped)
>>>>>>                     deferred_split_folio(folio);
>>>>>
>>>>> !compound will always be true I guess, so nr_pmdmapped == 0 (which will always
>>>>> be the case) will be ignored.
>>>>
>>>> I don't follow why !compound will always be true. This function is
>>>> page_remove_rmap() (not folio_remove_rmap_range() which I add in a later
>>>> patch).
>>>> page_remove_rmap() can work on pmd-mapped pages where compound=true is
>>>> passed in.
>>>
>>> I was talking about the folio_test_pmd_mappable() -> folio_test_large() change.
>>> For folio_test_large() && !folio_test_pmd_mappable() I expect that we'll never
>>> pass in "compound=true".
>>>
>>
>> Sorry David, I've been staring at the code and your comment, and I still don't
>> understand your point. I assumed you were trying to say that compound is always
>> false and therefore "if (!compound || nr < nr_pmdmapped)" can be removed? But
>> it's not the case that compound is always false; it will be true when called to
>> remove a pmd-mapped compound page.
>
> Let me try again:
>
> Assume, as I wrote, that we are given a folio that is "folio_test_large() &&
> !folio_test_pmd_mappable()". That is, a folio that is *not* pmd mappable.
>
> If it's not pmd-mappable, certainly, nr_pmdmapped == 0, and therefore, "nr <
> nr_pmdmapped" will never ever trigger.
>
> The only way to have it added to the deferred split queue is, therefore "if
> (!compound)".
>
> So *for these folios*, we will always pass "compound == false" to make that "if
> (!compound)" succeed.
>
>
> Does that make sense?

Yes I agree with all of this. I thought you were pointing out an issue or
proposing a change to the logic. Hence my confusion.

>
>> What change are you suggesting, exactly?
>
> Oh, I never suggested a change (I even gave you my RB). I was just thinking out
> loud.
>


2023-07-18 10:32:15

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 2/3] mm: Implement folio_remove_rmap_range()

On 18/07/2023 08:12, Huang, Ying wrote:
> Ryan Roberts <[email protected]> writes:
>
>> Like page_remove_rmap() but batch-removes the rmap for a range of pages
>> belonging to a folio. This can provide a small speedup due to less
>> manipuation of the various counters. But more crucially, if removing the
>> rmap for all pages of a folio in a batch, there is no need to
>> (spuriously) add it to the deferred split list, which saves significant
>> cost when there is contention for the split queue lock.
>>
>> All contained pages are accounted using the order-0 folio (or base page)
>> scheme.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> include/linux/rmap.h | 2 ++
>> mm/rmap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 67 insertions(+)
>>
>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>> index b87d01660412..f578975c12c0 100644
>> --- a/include/linux/rmap.h
>> +++ b/include/linux/rmap.h
>> @@ -200,6 +200,8 @@ void page_add_file_rmap(struct page *, struct vm_area_struct *,
>> bool compound);
>> void page_remove_rmap(struct page *, struct vm_area_struct *,
>> bool compound);
>> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
>> + int nr, struct vm_area_struct *vma);
>>
>> void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
>> unsigned long address, rmap_t flags);
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 2baf57d65c23..1da05aca2bb1 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1359,6 +1359,71 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>> mlock_vma_folio(folio, vma, compound);
>> }
>>
>> +/*
>> + * folio_remove_rmap_range - take down pte mappings from a range of pages
>> + * belonging to a folio. All pages are accounted as small pages.
>> + * @folio: folio that all pages belong to
>> + * @page: first page in range to remove mapping from
>> + * @nr: number of pages in range to remove mapping from
>> + * @vma: the vm area from which the mapping is removed
>> + *
>> + * The caller needs to hold the pte lock.
>> + */
>> +void folio_remove_rmap_range(struct folio *folio, struct page *page,
>> + int nr, struct vm_area_struct *vma)
>
> Can we call folio_remove_rmap_range() in page_remove_rmap() if
> !compound? This can give us some opportunities to reduce code
> duplication?

I considered that, but it felt like the savings were pretty small, so my opinion
was that it was cleaner not to do this. This is the best I came up with. Perhaps
you can see further improvements?

void page_remove_rmap(struct page *page, struct vm_area_struct *vma,
                      bool compound)
{
        struct folio *folio = page_folio(page);
        atomic_t *mapped = &folio->_nr_pages_mapped;
        int nr = 0, nr_pmdmapped = 0;
        bool last;
        enum node_stat_item idx;

        VM_BUG_ON_PAGE(compound && !PageHead(page), page);

        /* Hugetlb pages are not counted in NR_*MAPPED */
        if (unlikely(folio_test_hugetlb(folio))) {
                /* hugetlb pages are always mapped with pmds */
                atomic_dec(&folio->_entire_mapcount);
                return;
        }

        /* Is page being unmapped by PTE? Is this its last map to be removed? */
        if (likely(!compound)) {
                folio_remove_rmap_range(folio, page, 1, vma);
                return;
        } else if (folio_test_pmd_mappable(folio)) {
                /* That test is redundant: it's for safety or to optimize out */

                last = atomic_add_negative(-1, &folio->_entire_mapcount);
                if (last) {
                        nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped);
                        if (likely(nr < COMPOUND_MAPPED)) {
                                nr_pmdmapped = folio_nr_pages(folio);
                                nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
                                /* Raced ahead of another remove and an add? */
                                if (unlikely(nr < 0))
                                        nr = 0;
                        } else {
                                /* An add of COMPOUND_MAPPED raced ahead */
                                nr = 0;
                        }
                }
        }

        if (nr_pmdmapped) {
                if (folio_test_anon(folio))
                        idx = NR_ANON_THPS;
                else if (folio_test_swapbacked(folio))
                        idx = NR_SHMEM_PMDMAPPED;
                else
                        idx = NR_FILE_PMDMAPPED;
                __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped);
        }
        if (nr) {
                idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED;
                __lruvec_stat_mod_folio(folio, idx, -nr);

                /*
                 * Queue anon THP for deferred split if at least one
                 * page of the folio is unmapped and at least one page
                 * is still mapped.
                 */
                if (folio_test_anon(folio) && nr < nr_pmdmapped)
                        deferred_split_folio(folio);
        }

        /*
         * It would be tidy to reset folio_test_anon mapping when fully
         * unmapped, but that might overwrite a racing page_add_anon_rmap
         * which increments mapcount after us but sets mapping before us:
         * so leave the reset to free_pages_prepare, and remember that
         * it's only reliable while mapped.
         */

        munlock_vma_folio(folio, vma, compound);
}

>
> Best Regards,
> Huang, Ying
>

2023-07-18 11:04:59

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 3/3] mm: Batch-zap large anonymous folio PTE mappings

On 17/07/2023 17:15, Zi Yan wrote:
> On 17 Jul 2023, at 11:55, Ryan Roberts wrote:
>
>> On 17/07/2023 16:25, Zi Yan wrote:
>>> On 17 Jul 2023, at 10:31, Ryan Roberts wrote:
>>>
>>>> This allows batching the rmap removal with folio_remove_rmap_range(),
>>>> which means we avoid spuriously adding a partially unmapped folio to the
>>>> deferred split queue in the common case, which reduces split queue lock
>>>> contention.
>>>>
>>>> Previously each page was removed from the rmap individually with
>>>> page_remove_rmap(). If the first page belonged to a large folio, this
>>>> would cause page_remove_rmap() to conclude that the folio was now
>>>> partially mapped and add the folio to the deferred split queue. But
>>>> subsequent calls would cause the folio to become fully unmapped, meaning
>>>> there is no value to adding it to the split queue.
>>>>
>>>> Signed-off-by: Ryan Roberts <[email protected]>
>>>> ---
>>>> mm/memory.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>> 1 file changed, 119 insertions(+)
>>>>
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index 01f39e8144ef..6facb8c8807a 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -1391,6 +1391,95 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>>>> pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
>>>> }
>>>>
>>>> +static inline unsigned long page_addr(struct page *page,
>>>> + struct page *anchor, unsigned long anchor_addr)
>>>> +{
>>>> + unsigned long offset;
>>>> + unsigned long addr;
>>>> +
>>>> + offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
>>>> + addr = anchor_addr + offset;
>>>> +
>>>> + if (anchor > page) {
>>>> + if (addr > anchor_addr)
>>>> + return 0;
>>>> + } else {
>>>> + if (addr < anchor_addr)
>>>> + return ULONG_MAX;
>>>> + }
>>>> +
>>>> + return addr;
>>>> +}
>>>> +
>>>> +static int calc_anon_folio_map_pgcount(struct folio *folio,
>>>> + struct page *page, pte_t *pte,
>>>> + unsigned long addr, unsigned long end)
>>>> +{
>>>> + pte_t ptent;
>>>> + int floops;
>>>> + int i;
>>>> + unsigned long pfn;
>>>> +
>>>> + end = min(page_addr(&folio->page + folio_nr_pages(folio), page, addr),
>>>> + end);
>>>> + floops = (end - addr) >> PAGE_SHIFT;
>>>> + pfn = page_to_pfn(page);
>>>> + pfn++;
>>>> + pte++;
>>>> +
>>>> + for (i = 1; i < floops; i++) {
>>>> + ptent = ptep_get(pte);
>>>> +
>>>> + if (!pte_present(ptent) ||
>>>> + pte_pfn(ptent) != pfn) {
>>>> + return i;
>>>> + }
>>>> +
>>>> + pfn++;
>>>> + pte++;
>>>> + }
>>>> +
>>>> + return floops;
>>>> +}
>>>> +
>>>> +static unsigned long zap_anon_pte_range(struct mmu_gather *tlb,
>>>> + struct vm_area_struct *vma,
>>>> + struct page *page, pte_t *pte,
>>>> + unsigned long addr, unsigned long end,
>>>> + bool *full_out)
>>>> +{
>>>> + struct folio *folio = page_folio(page);
>>>> + struct mm_struct *mm = tlb->mm;
>>>> + pte_t ptent;
>>>> + int pgcount;
>>>> + int i;
>>>> + bool full;
>>>> +
>>>> + pgcount = calc_anon_folio_map_pgcount(folio, page, pte, addr, end);
>>>> +
>>>> + for (i = 0; i < pgcount;) {
>>>> + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
>>>> + tlb_remove_tlb_entry(tlb, pte, addr);
>>>> + full = __tlb_remove_page(tlb, page, 0);
>>>> +
>>>> + if (unlikely(page_mapcount(page) < 1))
>>>> + print_bad_pte(vma, addr, ptent, page);
>>>> +
>>>> + i++;
>>>> + page++;
>>>> + pte++;
>>>> + addr += PAGE_SIZE;
>>>> +
>>>> + if (unlikely(full))
>>>> + break;
>>>> + }
>>>> +
>>>> + folio_remove_rmap_range(folio, page - i, i, vma);
>>>> +
>>>> + *full_out = full;
>>>> + return i;
>>>> +}
>>>> +
>>>> static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>>> struct vm_area_struct *vma, pmd_t *pmd,
>>>> unsigned long addr, unsigned long end,
>>>> @@ -1428,6 +1517,36 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>>> page = vm_normal_page(vma, addr, ptent);
>>>> if (unlikely(!should_zap_page(details, page)))
>>>> continue;
>>>> +
>>>> + /*
>>>> + * Batch zap large anonymous folio mappings. This allows
>>>> + * batching the rmap removal, which means we avoid
>>>> + * spuriously adding a partially unmapped folio to the
>>>> + * deferred split queue in the common case, which
>>>> + * reduces split queue lock contention. Require the VMA
>>>> + * to be anonymous to ensure that none of the PTEs in
>>>> + * the range require zap_install_uffd_wp_if_needed().
>>>> + */
>>>> + if (page && PageAnon(page) && vma_is_anonymous(vma)) {
>>>> + bool full;
>>>> + int pgcount;
>>>> +
>>>> + pgcount = zap_anon_pte_range(tlb, vma,
>>>> + page, pte, addr, end, &full);
>>>
>>> Are you trying to zap as many ptes as possible if all these ptes are
>>> within a folio?
>>
>> Yes.
>>
>>> If so, why not calculate end before calling zap_anon_pte_range()?
>>> That would make zap_anon_pte_range() simpler.
>>
>> I'm not sure I follow. That's currently done in calc_anon_folio_map_pgcount(). I
>> could move it to here, but I'm not sure that makes things simpler, just puts
>> more code in here and less in there?
>
> Otherwise your zap_anon_pte_range() is really zap_anon_pte_in_folio_range() or
> some other more descriptive name. When I first looked at the name, I thought
> PTEs would be zapped until the end. But that is not the case when I looked at
> the code. And future users could easily be confused too and use it in the wrong way.

OK, I see your point. Let me pull the page count calculation into here and pass
it to zap_anon_pte_range(). Then I think we can keep the name as is?
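
i.e. something of this shape (untested sketch just to confirm we mean the same
thing; the exact signature will be settled in v2):

        if (page && PageAnon(page) && vma_is_anonymous(vma)) {
                struct folio *folio = page_folio(page);
                bool full;
                int pgcount;

                /* caller works out how many PTEs can be batched... */
                pgcount = calc_anon_folio_map_pgcount(folio, page, pte,
                                                      addr, end);

                /* ...and zap_anon_pte_range() zaps exactly that many */
                pgcount = zap_anon_pte_range(tlb, vma, page, pte, addr,
                                             pgcount, &full);

                rss[mm_counter(page)] -= pgcount;
                /* pte/addr advance and force_flush handling as before */
        }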


>
> BTW, page_addr() needs a better name and is easily confused with existing
> page_address().

Yeah... I'll try to think of something for v2.

>
>>
>>> Also check if page is part of
>>> a large folio first to make sure you can batch.
>>
>> Yeah that's fair. I'd be inclined to put that in zap_anon_pte_range() to short
>> circuit calc_anon_folio_map_pgcount(). But ultimately zap_anon_pte_range() would
>> still zap the single pte.
>>
>>
>>>
>>>> +
>>>> + rss[mm_counter(page)] -= pgcount;
>>>> + pgcount--;
>>>> + pte += pgcount;
>>>> + addr += pgcount << PAGE_SHIFT;
>>>> +
>>>> + if (unlikely(full)) {
>>>> + force_flush = 1;
>>>> + addr += PAGE_SIZE;
>>>> + break;
>>>> + }
>>>> + continue;
>>>> + }
>>>> +
>>>> ptent = ptep_get_and_clear_full(mm, addr, pte,
>>>> tlb->fullmm);
>>>> tlb_remove_tlb_entry(tlb, pte, addr);
>>>> --
>>>> 2.25.1
>>>
>>>
>>> --
>>> Best Regards,
>>> Yan, Zi
>
>
> --
> Best Regards,
> Yan, Zi


2023-07-18 11:12:07

by Ryan Roberts

[permalink] [raw]
Subject: Re: [PATCH v1 3/3] mm: Batch-zap large anonymous folio PTE mappings

On 18/07/2023 00:27, Yin Fengwei wrote:
>
>
> On 7/17/23 22:31, Ryan Roberts wrote:
>> This allows batching the rmap removal with folio_remove_rmap_range(),
>> which means we avoid spuriously adding a partially unmapped folio to the
>> deferred split queue in the common case, which reduces split queue lock
>> contention.
>>
>> Previously each page was removed from the rmap individually with
>> page_remove_rmap(). If the first page belonged to a large folio, this
>> would cause page_remove_rmap() to conclude that the folio was now
>> partially mapped and add the folio to the deferred split queue. But
>> subsequent calls would cause the folio to become fully unmapped, meaning
>> there is no value to adding it to the split queue.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> mm/memory.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 119 insertions(+)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 01f39e8144ef..6facb8c8807a 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1391,6 +1391,95 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>> pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
>> }
>>
>> +static inline unsigned long page_addr(struct page *page,
>> + struct page *anchor, unsigned long anchor_addr)
>> +{
>> + unsigned long offset;
>> + unsigned long addr;
>> +
>> + offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
>> + addr = anchor_addr + offset;
>> +
>> + if (anchor > page) {
>> + if (addr > anchor_addr)
>> + return 0;
>> + } else {
>> + if (addr < anchor_addr)
>> + return ULONG_MAX;
>> + }
>> +
>> + return addr;
>> +}
>> +
>> +static int calc_anon_folio_map_pgcount(struct folio *folio,
>> + struct page *page, pte_t *pte,
>> + unsigned long addr, unsigned long end)
>> +{
>> + pte_t ptent;
>> + int floops;
>> + int i;
>> + unsigned long pfn;
>> +
>> + end = min(page_addr(&folio->page + folio_nr_pages(folio), page, addr),
>> + end);
>> + floops = (end - addr) >> PAGE_SHIFT;
>> + pfn = page_to_pfn(page);
>> + pfn++;
>> + pte++;
>> +
>> + for (i = 1; i < floops; i++) {
>> + ptent = ptep_get(pte);
>> +
>> + if (!pte_present(ptent) ||
>> + pte_pfn(ptent) != pfn) {
>> + return i;
>> + }
>> +
>> + pfn++;
>> + pte++;
>> + }
>> +
>> + return floops;
>> +}
>> +
>> +static unsigned long zap_anon_pte_range(struct mmu_gather *tlb,
>> + struct vm_area_struct *vma,
>> + struct page *page, pte_t *pte,
>> + unsigned long addr, unsigned long end,
>> + bool *full_out)
>> +{
>> + struct folio *folio = page_folio(page);
>> + struct mm_struct *mm = tlb->mm;
>> + pte_t ptent;
>> + int pgcount;
>> + int i;
>> + bool full;
>> +
>> + pgcount = calc_anon_folio_map_pgcount(folio, page, pte, addr, end);
>> +
>> + for (i = 0; i < pgcount;) {
>> + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
>> + tlb_remove_tlb_entry(tlb, pte, addr);
>> + full = __tlb_remove_page(tlb, page, 0);
>> +
>> + if (unlikely(page_mapcount(page) < 1))
>> + print_bad_pte(vma, addr, ptent, page);
>> +
>> + i++;
>> + page++;
>> + pte++;
>> + addr += PAGE_SIZE;
>> +
>> + if (unlikely(full))
>> + break;
>> + }
>> +
>> + folio_remove_rmap_range(folio, page - i, i, vma);
>> +
>> + *full_out = full;
>> + return i;
>> +}
>> +
>> static unsigned long zap_pte_range(struct mmu_gather *tlb,
>> struct vm_area_struct *vma, pmd_t *pmd,
>> unsigned long addr, unsigned long end,
>> @@ -1428,6 +1517,36 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>> page = vm_normal_page(vma, addr, ptent);
>> if (unlikely(!should_zap_page(details, page)))
>> continue;
>> +
>> + /*
>> + * Batch zap large anonymous folio mappings. This allows
>> + * batching the rmap removal, which means we avoid
>> + * spuriously adding a partially unmapped folio to the
>> + * deferred split queue in the common case, which
>> + * reduces split queue lock contention. Require the VMA
>> + * to be anonymous to ensure that none of the PTEs in
>> + * the range require zap_install_uffd_wp_if_needed().
>> + */
>> + if (page && PageAnon(page) && vma_is_anonymous(vma)) {
> Why is this only for anonymous pages? I suppose it could support file mappings also.

I was trying to avoid the complexity. For file-backed pages, there is a bunch of
dirty and access management stuff that needs to happen (see "if
(!PageAnon(page)) {" a bit further down). And for file-backed VMAs (even if the
page is anon, I think?) zap_install_uffd_wp_if_needed() might do some extra
work, which again I didn't want to have to drag into zap_anon_pte_range().

I guess it's implementable, but given that only anon folios will be
deferred-split, and that anon folios in a file-backed VMA will all be single
pages, I didn't feel that the extra complexity would add anything performance-wise.
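
For reference, the sort of handling I mean for the file-backed case is roughly
the below (paraphrased and simplified from the existing code in zap_pte_range();
not part of this patch):

        if (!PageAnon(page)) {
                if (pte_dirty(ptent)) {
                        force_flush = 1;
                        set_page_dirty(page);
                }
                /* simplified: the real code also checks the VMA's readahead hints */
                if (pte_young(ptent))
                        mark_page_accessed(page);
        }

All of that would need to move into (or be duplicated in) any batched
file-backed variant, on top of the uffd-wp handling.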


>
>
> Regards
> Yin, Fengwei
>
>> + bool full;
>> + int pgcount;
>> +
>> + pgcount = zap_anon_pte_range(tlb, vma,
>> + page, pte, addr, end, &full);
>> +
>> + rss[mm_counter(page)] -= pgcount;
>> + pgcount--;
>> + pte += pgcount;
>> + addr += pgcount << PAGE_SHIFT;
>> +
>> + if (unlikely(full)) {
>> + force_flush = 1;
>> + addr += PAGE_SIZE;
>> + break;
>> + }
>> + continue;
>> + }
>> +
>> ptent = ptep_get_and_clear_full(mm, addr, pte,
>> tlb->fullmm);
>> tlb_remove_tlb_entry(tlb, pte, addr);


2023-07-18 14:35:38

by Zi Yan

[permalink] [raw]
Subject: Re: [PATCH v1 3/3] mm: Batch-zap large anonymous folio PTE mappings

On 18 Jul 2023, at 6:19, Ryan Roberts wrote:

> On 17/07/2023 17:15, Zi Yan wrote:
>> On 17 Jul 2023, at 11:55, Ryan Roberts wrote:
>>
>>> On 17/07/2023 16:25, Zi Yan wrote:
>>>> On 17 Jul 2023, at 10:31, Ryan Roberts wrote:
>>>>
>>>>> This allows batching the rmap removal with folio_remove_rmap_range(),
>>>>> which means we avoid spuriously adding a partially unmapped folio to the
>>>>> deferred split queue in the common case, which reduces split queue lock
>>>>> contention.
>>>>>
>>>>> Previously each page was removed from the rmap individually with
>>>>> page_remove_rmap(). If the first page belonged to a large folio, this
>>>>> would cause page_remove_rmap() to conclude that the folio was now
>>>>> partially mapped and add the folio to the deferred split queue. But
>>>>> subsequent calls would cause the folio to become fully unmapped, meaning
>>>>> there is no value to adding it to the split queue.
>>>>>
>>>>> Signed-off-by: Ryan Roberts <[email protected]>
>>>>> ---
>>>>> mm/memory.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>> 1 file changed, 119 insertions(+)
>>>>>
>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>> index 01f39e8144ef..6facb8c8807a 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -1391,6 +1391,95 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
>>>>> pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
>>>>> }
>>>>>
>>>>> +static inline unsigned long page_addr(struct page *page,
>>>>> + struct page *anchor, unsigned long anchor_addr)
>>>>> +{
>>>>> + unsigned long offset;
>>>>> + unsigned long addr;
>>>>> +
>>>>> + offset = (page_to_pfn(page) - page_to_pfn(anchor)) << PAGE_SHIFT;
>>>>> + addr = anchor_addr + offset;
>>>>> +
>>>>> + if (anchor > page) {
>>>>> + if (addr > anchor_addr)
>>>>> + return 0;
>>>>> + } else {
>>>>> + if (addr < anchor_addr)
>>>>> + return ULONG_MAX;
>>>>> + }
>>>>> +
>>>>> + return addr;
>>>>> +}
>>>>> +
>>>>> +static int calc_anon_folio_map_pgcount(struct folio *folio,
>>>>> + struct page *page, pte_t *pte,
>>>>> + unsigned long addr, unsigned long end)
>>>>> +{
>>>>> + pte_t ptent;
>>>>> + int floops;
>>>>> + int i;
>>>>> + unsigned long pfn;
>>>>> +
>>>>> + end = min(page_addr(&folio->page + folio_nr_pages(folio), page, addr),
>>>>> + end);
>>>>> + floops = (end - addr) >> PAGE_SHIFT;
>>>>> + pfn = page_to_pfn(page);
>>>>> + pfn++;
>>>>> + pte++;
>>>>> +
>>>>> + for (i = 1; i < floops; i++) {
>>>>> + ptent = ptep_get(pte);
>>>>> +
>>>>> + if (!pte_present(ptent) ||
>>>>> + pte_pfn(ptent) != pfn) {
>>>>> + return i;
>>>>> + }
>>>>> +
>>>>> + pfn++;
>>>>> + pte++;
>>>>> + }
>>>>> +
>>>>> + return floops;
>>>>> +}
>>>>> +
>>>>> +static unsigned long zap_anon_pte_range(struct mmu_gather *tlb,
>>>>> + struct vm_area_struct *vma,
>>>>> + struct page *page, pte_t *pte,
>>>>> + unsigned long addr, unsigned long end,
>>>>> + bool *full_out)
>>>>> +{
>>>>> + struct folio *folio = page_folio(page);
>>>>> + struct mm_struct *mm = tlb->mm;
>>>>> + pte_t ptent;
>>>>> + int pgcount;
>>>>> + int i;
>>>>> + bool full;
>>>>> +
>>>>> + pgcount = calc_anon_folio_map_pgcount(folio, page, pte, addr, end);
>>>>> +
>>>>> + for (i = 0; i < pgcount;) {
>>>>> + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
>>>>> + tlb_remove_tlb_entry(tlb, pte, addr);
>>>>> + full = __tlb_remove_page(tlb, page, 0);
>>>>> +
>>>>> + if (unlikely(page_mapcount(page) < 1))
>>>>> + print_bad_pte(vma, addr, ptent, page);
>>>>> +
>>>>> + i++;
>>>>> + page++;
>>>>> + pte++;
>>>>> + addr += PAGE_SIZE;
>>>>> +
>>>>> + if (unlikely(full))
>>>>> + break;
>>>>> + }
>>>>> +
>>>>> + folio_remove_rmap_range(folio, page - i, i, vma);
>>>>> +
>>>>> + *full_out = full;
>>>>> + return i;
>>>>> +}
>>>>> +
>>>>> static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>>>> struct vm_area_struct *vma, pmd_t *pmd,
>>>>> unsigned long addr, unsigned long end,
>>>>> @@ -1428,6 +1517,36 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
>>>>> page = vm_normal_page(vma, addr, ptent);
>>>>> if (unlikely(!should_zap_page(details, page)))
>>>>> continue;
>>>>> +
>>>>> + /*
>>>>> + * Batch zap large anonymous folio mappings. This allows
>>>>> + * batching the rmap removal, which means we avoid
>>>>> + * spuriously adding a partially unmapped folio to the
>>>>> + * deferred split queue in the common case, which
>>>>> + * reduces split queue lock contention. Require the VMA
>>>>> + * to be anonymous to ensure that none of the PTEs in
>>>>> + * the range require zap_install_uffd_wp_if_needed().
>>>>> + */
>>>>> + if (page && PageAnon(page) && vma_is_anonymous(vma)) {
>>>>> + bool full;
>>>>> + int pgcount;
>>>>> +
>>>>> + pgcount = zap_anon_pte_range(tlb, vma,
>>>>> + page, pte, addr, end, &full);
>>>>
>>>> Are you trying to zap as many ptes as possible if all these ptes are
>>>> within a folio?
>>>
>>> Yes.
>>>
>>>> If so, why not calculate end before calling zap_anon_pte_range()?
>>>> That would make zap_anon_pte_range() simpler.
>>>
>>> I'm not sure I follow. That's currently done in calc_anon_folio_map_pgcount(). I
>>> could move it to here, but I'm not sure that makes things simpler, just puts
>>> more code in here and less in there?
>>
>> Otherwise your zap_anon_pte_range() is really zap_anon_pte_in_folio_range() or
>> some other more descriptive name. When I first look at the name, I thought
>> PTEs will be zapped until the end. But that is not the case when I look at the
>> code. And future users can easily be confused too and use it in a wrong way.
>
> OK, I see your point. Let me pull the page count calculation into here and pass
> it to zap_anon_pte_range(). Then I think we can keep the name as is?

Yes. Thanks.

>
>
>>
>> BTW, page_addr() needs a better name and is easily confused with existing
>> page_address().
>
> Yeah... I'll try to think of something for v2.
>
>>
>>>
>>>> Also check if page is part of
>>>> a large folio first to make sure you can batch.
>>>
>>> Yeah that's fair. I'd be inclined to put that in zap_anon_pte_range() to short
>>> circuit calc_anon_folio_map_pgcount(). But ultimately zap_anon_pte_range() would
>>> still zap the single pte.
>>>
>>>
>>>>
>>>>> +
>>>>> + rss[mm_counter(page)] -= pgcount;
>>>>> + pgcount--;
>>>>> + pte += pgcount;
>>>>> + addr += pgcount << PAGE_SHIFT;
>>>>> +
>>>>> + if (unlikely(full)) {
>>>>> + force_flush = 1;
>>>>> + addr += PAGE_SIZE;
>>>>> + break;
>>>>> + }
>>>>> + continue;
>>>>> + }
>>>>> +
>>>>> ptent = ptep_get_and_clear_full(mm, addr, pte,
>>>>> tlb->fullmm);
>>>>> tlb_remove_tlb_entry(tlb, pte, addr);
>>>>> --
>>>>> 2.25.1
>>>>
>>>>
>>>> --
>>>> Best Regards,
>>>> Yan, Zi
>>
>>
>> --
>> Best Regards,
>> Yan, Zi


--
Best Regards,
Yan, Zi

