2015-05-07 07:23:40

by Aneesh Kumar K.V

Subject: [PATCH V2 1/2] mm/thp: Split out pmd collapse flush into a separate function

After this patch, pmdp_* functions operate only on hugepage ptes,
and not on regular pmd_t values pointing to a page table.

Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
arch/powerpc/include/asm/pgtable-ppc64.h | 4 ++
arch/powerpc/mm/pgtable_64.c | 76 +++++++++++++++++---------------
include/asm-generic/pgtable.h | 19 ++++++++
mm/huge_memory.c | 2 +-
4 files changed, 65 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 43e6ad424c7f..50830c9a2116 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -576,6 +576,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
extern void pmdp_splitting_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);

+#define __HAVE_ARCH_PMDP_COLLAPSE_FLUSH
+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
#define __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable);
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 59daa5eeec25..9171c1a37290 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -560,41 +560,47 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
pmd_t pmd;

VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- if (pmd_trans_huge(*pmdp)) {
- pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
- } else {
- /*
- * khugepaged calls this for normal pmd
- */
- pmd = *pmdp;
- pmd_clear(pmdp);
- /*
- * Wait for all pending hash_page to finish. This is needed
- * in case of subpage collapse. When we collapse normal pages
- * to hugepage, we first clear the pmd, then invalidate all
- * the PTE entries. The assumption here is that any low level
- * page fault will see a none pmd and take the slow path that
- * will wait on mmap_sem. But we could very well be in a
- * hash_page with local ptep pointer value. Such a hash page
- * can result in adding new HPTE entries for normal subpages.
- * That means we could be modifying the page content as we
- * copy them to a huge page. So wait for parallel hash_page
- * to finish before invalidating HPTE entries. We can do this
- * by sending an IPI to all the cpus and executing a dummy
- * function there.
- */
- kick_all_cpus_sync();
- /*
- * Now invalidate the hpte entries in the range
- * covered by pmd. This make sure we take a
- * fault and will find the pmd as none, which will
- * result in a major fault which takes mmap_sem and
- * hence wait for collapse to complete. Without this
- * the __collapse_huge_page_copy can result in copying
- * the old content.
- */
- flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
- }
+ VM_BUG_ON(!pmd_trans_huge(*pmdp));
+ pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ return pmd;
+}
+
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_trans_huge(*pmdp));
+
+ pmd = *pmdp;
+ pmd_clear(pmdp);
+ /*
+ * Wait for all pending hash_page to finish. This is needed
+ * in case of subpage collapse. When we collapse normal pages
+ * to a hugepage, we first clear the pmd, then invalidate all
+ * the PTE entries. The assumption here is that any low level
+ * page fault will see a none pmd and take the slow path that
+ * will wait on mmap_sem. But we could very well be in a
+ * hash_page with a local ptep pointer value. Such a hash_page
+ * can result in adding new HPTE entries for normal subpages.
+ * That means we could be modifying the page content as we
+ * copy them to a huge page. So wait for parallel hash_page
+ * to finish before invalidating HPTE entries. We can do this
+ * by sending an IPI to all the cpus and executing a dummy
+ * function there.
+ */
+ kick_all_cpus_sync();
+ /*
+ * Now invalidate the hpte entries in the range
+ * covered by the pmd. This makes sure we take a
+ * fault and find the pmd as none, which will
+ * result in a major fault that takes mmap_sem and
+ * hence waits for the collapse to complete. Without
+ * this, __collapse_huge_page_copy can end up copying
+ * the old content.
+ */
+ flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
return pmd;
}

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 39f1d6a2b04d..80e6d415cd57 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -189,6 +189,25 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
#endif

+#ifndef __HAVE_ARCH_PMDP_COLLAPSE_FLUSH
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp)
+{
+ return pmdp_clear_flush(vma, address, pmdp);
+}
+#else
+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmdp)
+{
+ BUILD_BUG();
+ return __pmd(0);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 078832cf3636..88f695a4e38b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
- _pmd = pmdp_clear_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

--
2.1.4


2015-05-07 07:23:46

by Aneesh Kumar K.V

Subject: [PATCH V2 2/2] powerpc/thp: Serialize pmd clear against a linux page table walk.

Serialize against find_linux_pte_or_hugepte, which does a lock-less
lookup in the page tables with local interrupts disabled. For huge pages
it casts pmd_t to pte_t. Since the format of pte_t is different from
that of pmd_t, we want to prevent a transition from a pmd pointing to a
page table to a pmd pointing to a huge page (and back) while interrupts
are disabled. We clear the pmd to possibly replace it with a page table
pointer in different code paths, so make sure we wait for the parallel
find_linux_pte_or_hugepte to finish.

Reported-by: Kirill A. Shutemov <[email protected]>
Signed-off-by: Aneesh Kumar K.V <[email protected]>
---
Changes from v1:
* Move kick_all_cpus_sync to pmdp_get_and_clear so that it handles the
zap_huge_pmd case as well.

arch/powerpc/mm/pgtable_64.c | 11 +++++++++++
1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 9171c1a37290..049d961802aa 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -845,6 +845,17 @@ pmd_t pmdp_get_and_clear(struct mm_struct *mm,
* hash fault look at them.
*/
memset(pgtable, 0, PTE_FRAG_SIZE);
+ /*
+ * Serialize against find_linux_pte_or_hugepte, which does a lock-less
+ * lookup in the page tables with local interrupts disabled. For huge
+ * pages it casts pmd_t to pte_t. Since the format of pte_t is different
+ * from that of pmd_t, we want to prevent a transition from a pmd
+ * pointing to a page table to a pmd pointing to a huge page (and back)
+ * while interrupts are disabled. We clear the pmd to possibly replace
+ * it with a page table pointer in different code paths, so make sure
+ * we wait for the parallel find_linux_pte_or_hugepte to finish.
+ */
+ kick_all_cpus_sync();
return old_pmd;
}

--
2.1.4

2015-05-07 09:20:43

by Kirill A. Shutemov

Subject: Re: [PATCH V2 1/2] mm/thp: Split out pmd collapse flush into a separate function

On Thu, May 07, 2015 at 12:53:27PM +0530, Aneesh Kumar K.V wrote:
> After this patch, pmdp_* functions operate only on hugepage ptes,
> and not on regular pmd_t values pointing to a page table.
>
> Signed-off-by: Aneesh Kumar K.V <[email protected]>
> ---
> arch/powerpc/include/asm/pgtable-ppc64.h | 4 ++
> arch/powerpc/mm/pgtable_64.c | 76 +++++++++++++++++---------------
> include/asm-generic/pgtable.h | 19 ++++++++
> mm/huge_memory.c | 2 +-
> 4 files changed, 65 insertions(+), 36 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
> index 43e6ad424c7f..50830c9a2116 100644
> --- a/arch/powerpc/include/asm/pgtable-ppc64.h
> +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
> @@ -576,6 +576,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
> extern void pmdp_splitting_flush(struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmdp);
>
> +#define __HAVE_ARCH_PMDP_COLLAPSE_FLUSH
> +extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
> + unsigned long address, pmd_t *pmdp);
> +
> #define __HAVE_ARCH_PGTABLE_DEPOSIT
> extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
> pgtable_t pgtable);
> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> index 59daa5eeec25..9171c1a37290 100644
> --- a/arch/powerpc/mm/pgtable_64.c
> +++ b/arch/powerpc/mm/pgtable_64.c
> @@ -560,41 +560,47 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
> pmd_t pmd;
>
> VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> - if (pmd_trans_huge(*pmdp)) {
> - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
> - } else {
> - /*
> - * khugepaged calls this for normal pmd
> - */
> - pmd = *pmdp;
> - pmd_clear(pmdp);
> - /*
> - * Wait for all pending hash_page to finish. This is needed
> - * in case of subpage collapse. When we collapse normal pages
> - * to hugepage, we first clear the pmd, then invalidate all
> - * the PTE entries. The assumption here is that any low level
> - * page fault will see a none pmd and take the slow path that
> - * will wait on mmap_sem. But we could very well be in a
> - * hash_page with local ptep pointer value. Such a hash page
> - * can result in adding new HPTE entries for normal subpages.
> - * That means we could be modifying the page content as we
> - * copy them to a huge page. So wait for parallel hash_page
> - * to finish before invalidating HPTE entries. We can do this
> - * by sending an IPI to all the cpus and executing a dummy
> - * function there.
> - */
> - kick_all_cpus_sync();
> - /*
> - * Now invalidate the hpte entries in the range
> - * covered by pmd. This make sure we take a
> - * fault and will find the pmd as none, which will
> - * result in a major fault which takes mmap_sem and
> - * hence wait for collapse to complete. Without this
> - * the __collapse_huge_page_copy can result in copying
> - * the old content.
> - */
> - flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
> - }
> + VM_BUG_ON(!pmd_trans_huge(*pmdp));
> + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
> + return pmd;

The patches are in reverse order: you need to change pmdp_get_and_clear
first, otherwise you break bisectability. Or better, merge the patches
together.

> +}
> +
> +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
> + pmd_t *pmdp)
> +{
> + pmd_t pmd;
> +
> + VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> + VM_BUG_ON(pmd_trans_huge(*pmdp));
> +
> + pmd = *pmdp;
> + pmd_clear(pmdp);
> + /*
> + * Wait for all pending hash_page to finish. This is needed
> + * in case of subpage collapse. When we collapse normal pages
> + * to a hugepage, we first clear the pmd, then invalidate all
> + * the PTE entries. The assumption here is that any low level
> + * page fault will see a none pmd and take the slow path that
> + * will wait on mmap_sem. But we could very well be in a
> + * hash_page with a local ptep pointer value. Such a hash_page
> + * can result in adding new HPTE entries for normal subpages.
> + * That means we could be modifying the page content as we
> + * copy them to a huge page. So wait for parallel hash_page
> + * to finish before invalidating HPTE entries. We can do this
> + * by sending an IPI to all the cpus and executing a dummy
> + * function there.
> + */
> + kick_all_cpus_sync();
> + /*
> + * Now invalidate the hpte entries in the range
> + * covered by the pmd. This makes sure we take a
> + * fault and find the pmd as none, which will
> + * result in a major fault that takes mmap_sem and
> + * hence waits for the collapse to complete. Without
> + * this, __collapse_huge_page_copy can end up copying
> + * the old content.
> + */
> + flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
> return pmd;
> }
>
> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
> index 39f1d6a2b04d..80e6d415cd57 100644
> --- a/include/asm-generic/pgtable.h
> +++ b/include/asm-generic/pgtable.h
> @@ -189,6 +189,25 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmdp);
> #endif
>
> +#ifndef __HAVE_ARCH_PMDP_COLLAPSE_FLUSH
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
> + unsigned long address,
> + pmd_t *pmdp)
> +{
> + return pmdp_clear_flush(vma, address, pmdp);
> +}
> +#else
> +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
> + unsigned long address,
> + pmd_t *pmdp)
> +{
> + BUILD_BUG();
> + return __pmd(0);
> +}
> +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
> +#endif
> +
> #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
> extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
> pgtable_t pgtable);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 078832cf3636..88f695a4e38b 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm,
> * huge and small TLB entries for the same virtual address
> * to avoid the risk of CPU bugs in that area.
> */
> - _pmd = pmdp_clear_flush(vma, address, pmd);
> + _pmd = pmdp_collapse_flush(vma, address, pmd);

Why? pmdp_clear_flush() does kick_all_cpus_sync() already.

> spin_unlock(pmd_ptl);
> mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
>
> --
> 2.1.4
>

--
Kirill A. Shutemov

2015-05-07 11:19:13

by Aneesh Kumar K.V

Subject: Re: [PATCH V2 1/2] mm/thp: Split out pmd collapse flush into a separate function

"Kirill A. Shutemov" <[email protected]> writes:

> On Thu, May 07, 2015 at 12:53:27PM +0530, Aneesh Kumar K.V wrote:
>> After this patch, pmdp_* functions operate only on hugepage ptes,
>> and not on regular pmd_t values pointing to a page table.
>>
>> Signed-off-by: Aneesh Kumar K.V <[email protected]>
>> ---
>> arch/powerpc/include/asm/pgtable-ppc64.h | 4 ++
>> arch/powerpc/mm/pgtable_64.c | 76 +++++++++++++++++---------------
>> include/asm-generic/pgtable.h | 19 ++++++++
>> mm/huge_memory.c | 2 +-
>> 4 files changed, 65 insertions(+), 36 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
>> index 43e6ad424c7f..50830c9a2116 100644
>> --- a/arch/powerpc/include/asm/pgtable-ppc64.h
>> +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
>> @@ -576,6 +576,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
>> extern void pmdp_splitting_flush(struct vm_area_struct *vma,
>> unsigned long address, pmd_t *pmdp);
>>
>> +#define __HAVE_ARCH_PMDP_COLLAPSE_FLUSH
>> +extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>> + unsigned long address, pmd_t *pmdp);
>> +
>> #define __HAVE_ARCH_PGTABLE_DEPOSIT
>> extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
>> pgtable_t pgtable);
>> diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
>> index 59daa5eeec25..9171c1a37290 100644
>> --- a/arch/powerpc/mm/pgtable_64.c
>> +++ b/arch/powerpc/mm/pgtable_64.c
>> @@ -560,41 +560,47 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
>> pmd_t pmd;
>>
>> VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> - if (pmd_trans_huge(*pmdp)) {
>> - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
>> - } else {
>> - /*
>> - * khugepaged calls this for normal pmd
>> - */
>> - pmd = *pmdp;
>> - pmd_clear(pmdp);
>> - /*
>> - * Wait for all pending hash_page to finish. This is needed
>> - * in case of subpage collapse. When we collapse normal pages
>> - * to hugepage, we first clear the pmd, then invalidate all
>> - * the PTE entries. The assumption here is that any low level
>> - * page fault will see a none pmd and take the slow path that
>> - * will wait on mmap_sem. But we could very well be in a
>> - * hash_page with local ptep pointer value. Such a hash page
>> - * can result in adding new HPTE entries for normal subpages.
>> - * That means we could be modifying the page content as we
>> - * copy them to a huge page. So wait for parallel hash_page
>> - * to finish before invalidating HPTE entries. We can do this
>> - * by sending an IPI to all the cpus and executing a dummy
>> - * function there.
>> - */
>> - kick_all_cpus_sync();
>> - /*
>> - * Now invalidate the hpte entries in the range
>> - * covered by pmd. This make sure we take a
>> - * fault and will find the pmd as none, which will
>> - * result in a major fault which takes mmap_sem and
>> - * hence wait for collapse to complete. Without this
>> - * the __collapse_huge_page_copy can result in copying
>> - * the old content.
>> - */
>> - flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
>> - }
>> + VM_BUG_ON(!pmd_trans_huge(*pmdp));
>> + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
>> + return pmd;
>
> The patches are in reverse order: you need to change pmdp_get_and_clear
> first otherwise you break bisectability.
> Or better merge patches together.

The first patch is really a cleanup and should not result in functional
changes. It just makes sure that we use pmdp_* functions only on hugepage
ptes and not on regular pmd_t pointers to page tables. It avoids the
not-so-nice if (pmd_trans_huge()) check in the code and allows us to do
VM_BUG_ON(!pmd_trans_huge(*pmdp)) there. That is really important on
archs like ppc64, where the regular pmd format is different from the
hugepage pte format.
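
To make the resulting contract concrete, callers now pick the helper
that matches what the pmd holds, roughly like this (a sketch, not the
exact patch hunks):

	/* khugepaged collapse path: pmd still points to a page table */
	_pmd = pmdp_collapse_flush(vma, address, pmd);

	/* hugepage pte teardown paths (e.g. zap): pmd is a hugepage pte */
	pmd = pmdp_clear_flush(vma, address, pmdp);

Each helper can then assert its own precondition with VM_BUG_ON().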


>
>> +}
>> +
>> +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
>> + pmd_t *pmdp)
>> +{
>> + pmd_t pmd;
>> +
>> + VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>> + VM_BUG_ON(pmd_trans_huge(*pmdp));
>> +
>> + pmd = *pmdp;
>> + pmd_clear(pmdp);
>> + /*
>> + * Wait for all pending hash_page to finish. This is needed
>> + * in case of subpage collapse. When we collapse normal pages
>> + * to a hugepage, we first clear the pmd, then invalidate all
>> + * the PTE entries. The assumption here is that any low level
>> + * page fault will see a none pmd and take the slow path that
>> + * will wait on mmap_sem. But we could very well be in a
>> + * hash_page with a local ptep pointer value. Such a hash_page
>> + * can result in adding new HPTE entries for normal subpages.
>> + * That means we could be modifying the page content as we
>> + * copy them to a huge page. So wait for parallel hash_page
>> + * to finish before invalidating HPTE entries. We can do this
>> + * by sending an IPI to all the cpus and executing a dummy
>> + * function there.
>> + */
>> + kick_all_cpus_sync();
>> + /*
>> + * Now invalidate the hpte entries in the range
>> + * covered by the pmd. This makes sure we take a
>> + * fault and find the pmd as none, which will
>> + * result in a major fault that takes mmap_sem and
>> + * hence waits for the collapse to complete. Without
>> + * this, __collapse_huge_page_copy can end up copying
>> + * the old content.
>> + */
>> + flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
>> return pmd;
>> }
>>
>> diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
>> index 39f1d6a2b04d..80e6d415cd57 100644
>> --- a/include/asm-generic/pgtable.h
>> +++ b/include/asm-generic/pgtable.h
>> @@ -189,6 +189,25 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
>> unsigned long address, pmd_t *pmdp);
>> #endif
>>
>> +#ifndef __HAVE_ARCH_PMDP_COLLAPSE_FLUSH
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>> + unsigned long address,
>> + pmd_t *pmdp)
>> +{
>> + return pmdp_clear_flush(vma, address, pmdp);
>> +}
>> +#else
>> +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>> + unsigned long address,
>> + pmd_t *pmdp)
>> +{
>> + BUILD_BUG();
>> + return __pmd(0);
>> +}
>> +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>> +#endif
>> +
>> #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
>> extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
>> pgtable_t pgtable);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 078832cf3636..88f695a4e38b 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>> * huge and small TLB entries for the same virtual address
>> * to avoid the risk of CPU bugs in that area.
>> */
>> - _pmd = pmdp_clear_flush(vma, address, pmd);
>> + _pmd = pmdp_collapse_flush(vma, address, pmd);
>
> Why? pmdp_clear_flush() does kick_all_cpus_sync() already.

Here we are clearing a regular pmd_t, and for ppc64 that means we need
to invalidate all the normal page ptes we have already inserted in the
hardware hash page table. But before doing that we need to make sure
there are no parallel hash page table inserts going on. So we need to
do a kick_all_cpus_sync() before flushing the older hash table entries.
By moving this to a separate function we capture these details nicely.
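
In other words, the ordering pmdp_collapse_flush() has to provide on
ppc64 is roughly this (condensed from the patch above):

	pmd = *pmdp;
	pmd_clear(pmdp);	/* new walkers see a none pmd */
	kick_all_cpus_sync();	/* wait out hash_page running with the old ptep */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);	/* now kill the HPTEs */

The IPI sync has to come before the HPTE invalidation, and the generic
pmdp_clear_flush() has no reason to know about any of this.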

-aneesh

2015-05-08 22:21:52

by Andrew Morton

Subject: Re: [PATCH V2 2/2] powerpc/thp: Serialize pmd clear against a linux page table walk.

On Thu, 7 May 2015 12:53:28 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:

> Serialize against find_linux_pte_or_hugepte, which does a lock-less
> lookup in the page tables with local interrupts disabled. For huge pages
> it casts pmd_t to pte_t. Since the format of pte_t is different from
> that of pmd_t, we want to prevent a transition from a pmd pointing to a
> page table to a pmd pointing to a huge page (and back) while interrupts
> are disabled. We clear the pmd to possibly replace it with a page table
> pointer in different code paths, so make sure we wait for the parallel
> find_linux_pte_or_hugepte to finish.

I'm not seeing here any description of the problem which is being
fixed. Does the patch make the machine faster? Does the machine
crash?

2015-05-08 22:24:31

by Andrew Morton

Subject: Re: [PATCH V2 1/2] mm/thp: Split out pmd collapse flush into a separate function

On Thu, 7 May 2015 12:53:27 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:

>> After this patch, pmdp_* functions operate only on hugepage ptes,
>> and not on regular pmd_t values pointing to a page table.
>

The patch looks like a pretty safe no-op for non-powerpc?

> --- a/arch/powerpc/include/asm/pgtable-ppc64.h
> +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
> @@ -576,6 +576,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
> extern void pmdp_splitting_flush(struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmdp);
>
> +#define __HAVE_ARCH_PMDP_COLLAPSE_FLUSH
> +extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
> + unsigned long address, pmd_t *pmdp);
> +

The fashionable way of doing this is

extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
#define pmdp_collapse_flush pmdp_collapse_flush

then, elsewhere,

#ifndef pmdp_collapse_flush
static inline pmd_t pmdp_collapse_flush(...) {}
#define pmdp_collapse_flush pmdp_collapse_flush
#endif

It avoids introducing a second (ugly) symbol into the kernel.
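
Applied here, that would be something like (untested sketch):

	/* arch/powerpc/include/asm/pgtable-ppc64.h */
	extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
					 unsigned long address, pmd_t *pmdp);
	#define pmdp_collapse_flush pmdp_collapse_flush

	/* include/asm-generic/pgtable.h */
	#ifndef pmdp_collapse_flush
	static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
						unsigned long address,
						pmd_t *pmdp)
	{
		return pmdp_clear_flush(vma, address, pmdp);
	}
	#define pmdp_collapse_flush pmdp_collapse_flush
	#endif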

2015-05-11 06:30:40

by Aneesh Kumar K.V

Subject: Re: [PATCH V2 2/2] powerpc/thp: Serialize pmd clear against a linux page table walk.

Andrew Morton <[email protected]> writes:

> On Thu, 7 May 2015 12:53:28 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:
>
>> Serialize against find_linux_pte_or_hugepte, which does a lock-less
>> lookup in the page tables with local interrupts disabled. For huge pages
>> it casts pmd_t to pte_t. Since the format of pte_t is different from
>> that of pmd_t, we want to prevent a transition from a pmd pointing to a
>> page table to a pmd pointing to a huge page (and back) while interrupts
>> are disabled. We clear the pmd to possibly replace it with a page table
>> pointer in different code paths, so make sure we wait for the parallel
>> find_linux_pte_or_hugepte to finish.
>
> I'm not seeing here any description of the problem which is being
> fixed. Does the patch make the machine faster? Does the machine
> crash?

I sent v3 with an updated commit message; adding it below.

powerpc/thp: Serialize pmd clear against a linux page table walk.

Serialize against find_linux_pte_or_hugepte, which does a lock-less
lookup in the page tables with local interrupts disabled. For huge pages
it casts pmd_t to pte_t. Since the format of pte_t is different from
that of pmd_t, we want to prevent a transition from a pmd pointing to a
page table to a pmd pointing to a huge page (and back) while interrupts
are disabled. We clear the pmd to possibly replace it with a page table
pointer in different code paths, so make sure we wait for the parallel
find_linux_pte_or_hugepte to finish.

Without this patch, a find_linux_pte_or_hugepte running in parallel to
__split_huge_zero_page_pmd, do_huge_pmd_wp_page_fallback or zap_huge_pmd
can run into the above issue. With __split_huge_zero_page_pmd and
do_huge_pmd_wp_page_fallback we clear the hugepage pte before inserting
the pmd entry with a regular pgtable address. Such a clear needs to
wait for the parallel find_linux_pte_or_hugepte to finish.

With zap_huge_pmd, we can run into issues with a hugepage pte getting
zapped due to MADV_DONTNEED while another cpu faults it in as small
pages.
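
The walker side looks roughly like this (a sketch of the relevant part
of find_linux_pte_or_hugepte, not its actual body):

	local_irq_save(flags);		/* blocks the sync IPI below */
	pmd = *pmdp;
	if (pmd_trans_huge(pmd))
		pte = (pte_t *)pmdp;	/* pmd_t reinterpreted as pte_t */
	...				/* use pte while irqs stay disabled */
	local_irq_restore(flags);	/* only now can the sync IPI run here */

Since the IPI from kick_all_cpus_sync() cannot be serviced while the
walker has interrupts disabled, pmdp_get_and_clear() does not return
until all such walkers are done, so the pmd format cannot change under
them.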

-aneesh

2015-05-11 06:34:00

by Aneesh Kumar K.V

Subject: Re: [PATCH V2 1/2] mm/thp: Split out pmd collapse flush into a separate function

Andrew Morton <[email protected]> writes:

> On Thu, 7 May 2015 12:53:27 +0530 "Aneesh Kumar K.V" <[email protected]> wrote:
>
>> After this patch, pmdp_* functions operate only on hugepage ptes,
>> and not on regular pmd_t values pointing to a page table.
>>
>
> The patch looks like a pretty safe no-op for non-powerpc?

That is correct. I also updated the commit message:

mm/thp: Split out pmd collapse flush into a separate function

Architectures like ppc64 [1] need to do special things while clearing
the pmd before a collapse. For them this operation is largely different
from a normal hugepage pte clear. Hence add a separate function to
clear the pmd before collapse. After this patch, pmdp_* functions
operate only on hugepage ptes, and not on regular pmd_t values pointing
to a page table.

[1] ppc64 needs to invalidate all the normal page pte mappings we
already have inserted in the hardware hash page table. But before
doing that we need to make sure there are no parallel hash page
table inserts going on. So we need to do a kick_all_cpus_sync()
before flushing the older hash table entries. By moving this to
a separate function we capture these details and mention how it
is different from a hugepage pte clear.


>
>> --- a/arch/powerpc/include/asm/pgtable-ppc64.h
>> +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
>> @@ -576,6 +576,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
>> extern void pmdp_splitting_flush(struct vm_area_struct *vma,
>> unsigned long address, pmd_t *pmdp);
>>
>> +#define __HAVE_ARCH_PMDP_COLLAPSE_FLUSH
>> +extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
>> + unsigned long address, pmd_t *pmdp);
>> +
>
> The fashionable way of doing this is
>
> extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmdp);
> #define pmdp_collapse_flush pmdp_collapse_flush
>
> then, elsewhere,
>
> #ifndef pmdp_collapse_flush
> static inline pmd_t pmdp_collapse_flush(...) {}
> #define pmdp_collapse_flush pmdp_collapse_flush
> #endif
>
> It avoids introducing a second (ugly) symbol into the kernel.

OK, updated to the above style. The reason I used the earlier style was
that asm-generic/pgtable.h already has similar usages.


-aneesh