2021-02-11 00:08:44

by Mike Kravetz

Subject: [RFC PATCH 5/5] mm proc/task_mmu.c: add hugetlb specific routine for clear_refs

There is no hugetlb specific routine for clearing soft dirty and
other references. The 'default' routines would only clear the
VM_SOFTDIRTY flag in the vma.

Add new routine specifically for hugetlb vmas.

Signed-off-by: Mike Kravetz <[email protected]>
---
fs/proc/task_mmu.c | 110 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 110 insertions(+)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 829b35016aaa..f06cf9b131a8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1116,6 +1116,115 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
}
#endif

+#ifdef CONFIG_HUGETLB_PAGE
+static inline bool huge_pte_is_pinned(struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ struct page *page;
+
+ if (likely(!atomic_read(&vma->vm_mm->has_pinned)))
+ return false;
+ page = pte_page(pte);
+ if (!page)
+ return false;
+ return page_maybe_dma_pinned(page);
+}
+
+static int clear_refs_hugetlb_range(pte_t *ptep, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct hstate *h = hstate_vma(walk->vma);
+ unsigned long adj_start = addr, adj_end = end;
+ spinlock_t *ptl;
+ pte_t old_pte, pte;
+
+ /*
+ * clear_refs should only operate on complete vmas. Therefore,
+ * values passed here should be huge page aligned and huge page
+ * size in length. Quick validation before taking any action in
+ * case upstream code is changed.
+ */
+ if ((addr & hmask) != addr || end - addr != huge_page_size(h)) {
+ WARN_ONCE(1, "%s passed unaligned address\n", __func__);
+ return 1;
+ }
+
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
+
+ /* Soft dirty and pmd sharing do not mix */
+
+ pte = huge_ptep_get(ptep);
+ if (!pte_present(pte))
+ goto out;
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+ goto out;
+
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ if (huge_pte_is_pinned(vma, addr, pte))
+ goto out;
+
+ /*
+ * soft dirty and pmd sharing do not work together because
+ * per-process soft dirty state is tracked in ptes, and pmd
+ * sharing allows processes to share ptes. We unshare any pmds here.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &adj_start, &adj_end);
+ flush_cache_range(vma, adj_start, adj_end);
+ /*
+ * Only attempt unshare if sharing is possible. If we unshare,
+ * then ptes for a PUD sized area are effectively removed for
+ * this process. That clears soft dirty.
+ */
+ if (adj_start != addr || adj_end != end) {
+ struct mmu_notifier_range range;
+ int unshared;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR,
+ 0, vma, vma->vm_mm, adj_start, adj_end);
+ mmu_notifier_invalidate_range_start(&range);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+ unshared = huge_pmd_unshare(vma->vm_mm, vma,
+ &addr, ptep);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ mmu_notifier_invalidate_range_end(&range);
+ if (unshared)
+ goto unshare_done;
+ }
+
+ if (is_hugetlb_entry_migration(pte)) {
+ pte = huge_pte_swp_clear_soft_dirty(pte);
+ set_huge_pte_at(walk->mm, addr, ptep, pte);
+ } else {
+ old_pte = huge_ptep_modify_prot_start(vma, addr, ptep);
+ pte = pte_mkhuge(huge_pte_wrprotect(pte));
+ pte = arch_make_huge_pte(pte, vma, NULL, 0);
+ pte = huge_pte_clear_soft_dirty(pte);
+ pte = huge_pte_mkyoung(pte);
+ huge_ptep_modify_prot_commit(vma, addr, ptep,
+ old_pte, pte);
+ }
+
+unshare_done:
+ flush_hugetlb_tlb_range(vma, addr, end);
+ }
+
+ /* reference bits in hugetlb pages are not reset/used */
+out:
+ spin_unlock(ptl);
+ return 0;
+}
+#else /* CONFIG_HUGETLB_PAGE */
+static int clear_refs_hugetlb_range(pte_t *ptep, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ return 1;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
@@ -1198,6 +1307,7 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
}

static const struct mm_walk_ops clear_refs_walk_ops = {
+ .hugetlb_entry = clear_refs_hugetlb_range,
.pmd_entry = clear_refs_pte_range,
.test_walk = clear_refs_test_walk,
};
--
2.29.2


2021-02-17 21:25:07

by Peter Xu

Subject: Re: [RFC PATCH 5/5] mm proc/task_mmu.c: add hugetlb specific routine for clear_refs

On Wed, Feb 10, 2021 at 04:03:22PM -0800, Mike Kravetz wrote:
> There is no hugetlb specific routine for clearing soft dirty and
> other references. The 'default' routines would only clear the
> VM_SOFTDIRTY flag in the vma.
>
> Add new routine specifically for hugetlb vmas.
>
> Signed-off-by: Mike Kravetz <[email protected]>
> ---
> fs/proc/task_mmu.c | 110 +++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 110 insertions(+)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 829b35016aaa..f06cf9b131a8 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1116,6 +1116,115 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
> }
> #endif
>
> +#ifdef CONFIG_HUGETLB_PAGE
> +static inline bool huge_pte_is_pinned(struct vm_area_struct *vma,
> + unsigned long addr, pte_t pte)
> +{
> + struct page *page;
> +
> + if (likely(!atomic_read(&vma->vm_mm->has_pinned)))
> + return false;
> + page = pte_page(pte);
> + if (!page)
> + return false;
> + return page_maybe_dma_pinned(page);
> +}
> +
> +static int clear_refs_hugetlb_range(pte_t *ptep, unsigned long hmask,
> + unsigned long addr, unsigned long end,
> + struct mm_walk *walk)
> +{
> + struct clear_refs_private *cp = walk->private;
> + struct vm_area_struct *vma = walk->vma;
> + struct hstate *h = hstate_vma(walk->vma);
> + unsigned long adj_start = addr, adj_end = end;
> + spinlock_t *ptl;
> + pte_t old_pte, pte;
> +
> + /*
> + * clear_refs should only operate on complete vmas. Therefore,
> + * values passed here should be huge page aligned and huge page
> + * size in length. Quick validation before taking any action in
> + * case upstream code is changed.
> + */
> + if ((addr & hmask) != addr || end - addr != huge_page_size(h)) {
> + WARN_ONCE(1, "%s passed unaligned address\n", __func__);
> + return 1;
> + }

I wouldn't worry too much about the interface change - whoever changes the
interface should guarantee all existing hooks still work, shouldn't they? :)

It's slightly confusing to me why "clear_refs should only operate on
complete vmas" is related to the check, though.

> +
> + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
> +
> + /* Soft dirty and pmd sharing do not mix */

Right, this seems to be a placeholder for unsharing code.

Though maybe we can do that earlier in pre_vma() hook? That should be per-vma
rather than handling one specific huge page here, hence more efficient imho.

This reminded me that I should probably also move hugetlb_unshare_all_pmds() from
my other patch into hugetlb.c, so that this code can call it. Currently it's a
static function in userfaultfd.c.
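
Something like the below - completely untested, just to sketch the idea. It
assumes hugetlb_unshare_all_pmds() ends up exported from hugetlb.c as
mentioned above and takes just the vma; the hook name is made up:

static int clear_refs_pre_vma(unsigned long start, unsigned long end,
                              struct mm_walk *walk)
{
        struct clear_refs_private *cp = walk->private;
        struct vm_area_struct *vma = walk->vma;

        /*
         * Unshare pmds once per hugetlb vma, before the per-page walk,
         * so soft dirty can be tracked in this process's own ptes.
         */
        if (cp->type == CLEAR_REFS_SOFT_DIRTY && is_vm_hugetlb_page(vma))
                hugetlb_unshare_all_pmds(vma);

        return 0;
}

plus a matching ".pre_vma = clear_refs_pre_vma," entry in clear_refs_walk_ops.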

> +
> + pte = huge_ptep_get(ptep);
> + if (!pte_present(pte))
> + goto out;
> + if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
> + goto out;
> +
> + if (cp->type == CLEAR_REFS_SOFT_DIRTY) {

Maybe move this check into clear_refs_test_walk()? We can bail out earlier if:

(is_vm_hugetlb_page(vma) && (type != CLEAR_REFS_SOFT_DIRTY))
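
Something along these lines at the top of clear_refs_test_walk() - untested,
just to illustrate, assuming the usual cp/vma locals are in scope there:

        /* soft dirty is the only thing clear_refs handles for hugetlb vmas */
        if (is_vm_hugetlb_page(vma) && cp->type != CLEAR_REFS_SOFT_DIRTY)
                return 1;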

> + if (huge_pte_is_pinned(vma, addr, pte))
> + goto out;

Off topic for this patchset, but it's definitely a pity that we can't track
soft dirty for pinned pages. Currently the assumption of the pte code path is:
"if this page can be DMA written then we won't know whether data changed after
all, so tracking dirty is meaningless", however that's prone to change as new
hardware arrives - say, an IOMMU could start to trap DMA writes.

But again, that's another story... and we should just follow what we do with
non-hugetlbfs here, until some day we'd like to revive soft dirty
tracking with pinned pages.

> +
> + /*
> + * soft dirty and pmd sharing do not work together because
> + * per-process soft dirty state is tracked in ptes, and pmd
> + * sharing allows processes to share ptes. We unshare any pmds here.
> + */
> + adjust_range_if_pmd_sharing_possible(vma, &adj_start, &adj_end);

Ideally, by the time we reach here, huge pmd sharing won't exist any more,
right? Then do we still need to adjust the range at all?

Thanks,

--
Peter Xu

2021-02-19 00:16:29

by Mike Kravetz

Subject: Re: [RFC PATCH 5/5] mm proc/task_mmu.c: add hugetlb specific routine for clear_refs

On 2/17/21 12:25 PM, Peter Xu wrote:
> On Wed, Feb 10, 2021 at 04:03:22PM -0800, Mike Kravetz wrote:
>> There is no hugetlb specific routine for clearing soft dirty and
>> other references. The 'default' routines would only clear the
>> VM_SOFTDIRTY flag in the vma.
>>
>> Add new routine specifically for hugetlb vmas.
>>
>> Signed-off-by: Mike Kravetz <[email protected]>
>> ---
>> fs/proc/task_mmu.c | 110 +++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 110 insertions(+)
>>
>> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
>> index 829b35016aaa..f06cf9b131a8 100644
>> --- a/fs/proc/task_mmu.c
>> +++ b/fs/proc/task_mmu.c
>> @@ -1116,6 +1116,115 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
>> }
>> #endif
>>
>> +#ifdef CONFIG_HUGETLB_PAGE
>> +static inline bool huge_pte_is_pinned(struct vm_area_struct *vma,
>> + unsigned long addr, pte_t pte)
>> +{
>> + struct page *page;
>> +
>> + if (likely(!atomic_read(&vma->vm_mm->has_pinned)))
>> + return false;
>> + page = pte_page(pte);
>> + if (!page)
>> + return false;
>> + return page_maybe_dma_pinned(page);
>> +}
>> +
>> +static int clear_refs_hugetlb_range(pte_t *ptep, unsigned long hmask,
>> + unsigned long addr, unsigned long end,
>> + struct mm_walk *walk)
>> +{
>> + struct clear_refs_private *cp = walk->private;
>> + struct vm_area_struct *vma = walk->vma;
>> + struct hstate *h = hstate_vma(walk->vma);
>> + unsigned long adj_start = addr, adj_end = end;
>> + spinlock_t *ptl;
>> + pte_t old_pte, pte;
>> +
>> + /*
>> + * clear_refs should only operate on complete vmas. Therefore,
>> + * values passed here should be huge page aligned and huge page
>> + * size in length. Quick validation before taking any action in
>> + * case upstream code is changed.
>> + */
>> + if ((addr & hmask) != addr || end - addr != huge_page_size(h)) {
>> + WARN_ONCE(1, "%s passed unaligned address\n", __func__);
>> + return 1;
>> + }
>
> I wouldn't worry too much about the interface change - whoever changes the
> interface should guarantee all existing hooks still work, shouldn't they? :)

Yeah, I can drop this.

> It's slightly confusing to me why "clear_refs should only operate on
> complete vmas" is related to the check, though.

Mostly me thinking that since it is operating on a complete (hugetlb) vma,
we know the vma is huge page aligned and a multiple of huge pages in size.
So, all passed addresses should be huge page aligned as well.

>
>> +
>> + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
>> +
>> + /* Soft dirty and pmd sharing do not mix */
>
> Right, this seems to be a placeholder for unsharing code.

Sorry, that comment was left over from earlier code. Unsharing is actually
done below; I forgot to remove the comment.

> Though maybe we can do that earlier in pre_vma() hook? That should be per-vma
> rather than handling one specific huge page here, hence more efficient imho.

Yes, let me look into that. The code below is certainly not the most
efficient.

> This reminded me that I should probably also move hugetlb_unshare_all_pmds() from
> my other patch into hugetlb.c, so that this code can call it. Currently it's a
> static function in userfaultfd.c.
>
>> +
>> + pte = huge_ptep_get(ptep);
>> + if (!pte_present(pte))
>> + goto out;
>> + if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
>> + goto out;
>> +
>> + if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
>
> Maybe move this check into clear_refs_test_walk()? We can bail out earlier if:
>
> (is_vm_hugetlb_page(vma) && (type != CLEAR_REFS_SOFT_DIRTY))
>

Yes, we can do that. I was patterning this after the other 'clear_refs'
routines. But they can clear things besides soft dirty. Since soft
dirty is the only thing handled for hugetlb, we can bail earlier.

>> + if (huge_pte_is_pinned(vma, addr, pte))
>> + goto out;
>
> Off topic for this patchset, but it's definitely a pity that we can't track
> soft dirty for pinned pages. Currently the assumption of the pte code path is:
> "if this page can be DMA written then we won't know whether data changed after
> all, so tracking dirty is meaningless", however that's prone to change as new
> hardware arrives - say, an IOMMU could start to trap DMA writes.
>
> But again, that's another story... and we should just follow what we do with
> non-hugetlbfs here, until some day we'd like to revive soft dirty
> tracking with pinned pages.
>
>> +
>> + /*
>> + * soft dirty and pmd sharing do not work together because
>> + * per-process soft dirty state is tracked in ptes, and pmd
>> + * sharing allows processes to share ptes. We unshare any pmds here.
>> + */
>> + adjust_range_if_pmd_sharing_possible(vma, &adj_start, &adj_end);
>
> Ideally, by the time we reach here, huge pmd sharing won't exist any more,
> right? Then do we still need to adjust the range at all?

Right, we should be able to do it earlier.

Thanks again for taking a look at this.
--
Mike Kravetz

>
> Thanks,
>