On Fri, Nov 18, 2016 at 10:48:20AM +0800, Aaron Lu wrote:
> On 11/18/2016 01:53 AM, Linus Torvalds wrote:
> > I'm not entirely happy with the force_flush vs need_flush games, and I
> > really think this code should be updated to use the same "struct
> > mmu_gather" that we use for the other TLB flushing cases (no need for
> > the page freeing batching, but the tlb_flush_mmu_tlbonly() logic
> > should be the same).
>
> I see.
>
> >
> > But I guess that's a bigger change, so that wouldn't be appropriate for
> > rc5 or stable back-porting anyway. But it would be lovely if somebody
> > could look at that. Hint hint.
>
> I'll work on it, thanks for the suggestion.
So here it is. I'm not quite sure whether I've done the right thing in
patch 2/2, i.e. whether I should just use tlb_flush_mmu, or export
tlb_flush_mmu_tlbonly and then use it in mremap.c. Please take a look
and let me know what you think, thanks!
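For context, my understanding is that tlb_flush_mmu() is essentially
tlb_flush_mmu_tlbonly() followed by tlb_flush_mmu_free(), so with no pages
queued for freeing during mremap the two should behave the same here. The
flow I'm aiming for in move_page_tables() is roughly the sketch below
(illustration only; the locals like old_pte, mm and force_flush come from
the functions touched in patch 2/2, which has the real change):

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, vma->vm_mm, old_addr, old_end);
	mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);

	/* per-entry work, done in move_ptes()/move_huge_pmd() */
	tlb_remove_tlb_entry(&tlb, old_pte, old_addr);
	pte = ptep_get_and_clear(mm, old_addr, old_pte);
	if (force_flush)
		tlb_flush_mmu_tlbonly(&tlb);	/* flush before dropping the PTL */

	mmu_notifier_invalidate_range_end(vma->vm_mm, old_addr, old_end);
	tlb_finish_mmu(&tlb, old_addr, old_end);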
Regards,
Aaron
The mmu_gather logic for TLB flushing will be used in the mremap case,
so export this function.
Signed-off-by: Aaron Lu <[email protected]>
---
include/asm-generic/tlb.h | 1 +
mm/memory.c | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index c6d667187608..0f1861db935c 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -119,6 +119,7 @@ struct mmu_gather {
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end);
void tlb_flush_mmu(struct mmu_gather *tlb);
+void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb);
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start,
unsigned long end);
extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
diff --git a/mm/memory.c b/mm/memory.c
index e18c57bdc75c..130d82f7d8a2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -238,7 +238,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
__tlb_reset_range(tlb);
}
-static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
if (!tlb->end)
return;
--
2.5.5
As suggested by Linus, the same mmu_gather logic can be used for the TLB
flush in mremap, and this patch does that.
Note that since no page is added to "struct mmu_gather" for freeing
during mremap, either tlb_flush_mmu or tlb_flush_mmu_tlbonly can be used
when the TLB needs to be flushed. Using tlb_flush_mmu would also avoid
exporting tlb_flush_mmu_tlbonly, but tlb_flush_mmu_tlbonly *looks*
clearer and more straightforward, so I ended up using it.
Signed-off-by: Aaron Lu <[email protected]>
---
include/linux/huge_mm.h | 6 +++---
mm/huge_memory.c | 13 +++++++------
mm/mremap.c | 30 +++++++++++++++---------------
3 files changed, 25 insertions(+), 24 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e35e6de633b9..bbf64e05a49a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -20,9 +20,9 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec);
-extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush);
+extern bool move_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long new_addr,
+ unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd);
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot,
int prot_numa);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index eff3de359d50..33909f16a9ad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1424,9 +1424,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
-bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
+bool move_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long new_addr,
+ unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
@@ -1456,8 +1456,11 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+ tlb_remove_pmd_tlb_entry(tlb, old_pmd, old_addr);
if (pmd_present(*old_pmd) && pmd_dirty(*old_pmd))
force_flush = true;
+
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
@@ -1471,9 +1474,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
if (force_flush)
- flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
- else
- *need_flush = true;
+ tlb_flush_mmu_tlbonly(tlb);
spin_unlock(old_ptl);
return true;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 6ccecc03f56a..dfac96ec7294 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,6 +25,7 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#include "internal.h"
@@ -101,16 +102,15 @@ static pte_t move_soft_dirty_pte(pte_t pte)
return pte;
}
-static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
- unsigned long old_addr, unsigned long old_end,
+static void move_ptes(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *old_pmd, unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
- unsigned long new_addr, bool need_rmap_locks, bool *need_flush)
+ unsigned long new_addr, bool need_rmap_locks)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
- unsigned long len = old_end - old_addr;
/*
* When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
@@ -149,6 +149,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (pte_none(*old_pte))
continue;
+ tlb_remove_tlb_entry(tlb, old_pte, old_addr);
/*
* We are remapping a dirty PTE, make sure to
* flush TLB before we drop the PTL for the
@@ -166,10 +167,9 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
+
if (force_flush)
- flush_tlb_range(vma, old_end - len, old_end);
- else
- *need_flush = true;
+ tlb_flush_mmu_tlbonly(tlb);
pte_unmap_unlock(old_pte - 1, old_ptl);
if (need_rmap_locks)
drop_rmap_locks(vma);
@@ -184,15 +184,16 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
{
unsigned long extent, next, old_end;
pmd_t *old_pmd, *new_pmd;
- bool need_flush = false;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_gather tlb;
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
mmun_start = old_addr;
mmun_end = old_end;
+ tlb_gather_mmu(&tlb, vma->vm_mm, mmun_start, mmun_end);
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
@@ -214,9 +215,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
/* See comment in move_ptes() */
if (need_rmap_locks)
take_rmap_locks(vma);
- moved = move_huge_pmd(vma, old_addr, new_addr,
- old_end, old_pmd, new_pmd,
- &need_flush);
+ moved = move_huge_pmd(&tlb, vma, old_addr,
+ new_addr, old_end,
+ old_pmd, new_pmd);
if (need_rmap_locks)
drop_rmap_locks(vma);
if (moved)
@@ -233,13 +234,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
extent = next - new_addr;
if (extent > LATENCY_LIMIT)
extent = LATENCY_LIMIT;
- move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
- new_pmd, new_addr, need_rmap_locks, &need_flush);
+ move_ptes(&tlb, vma, old_pmd, old_addr, old_addr + extent,
+ new_vma, new_pmd, new_addr, need_rmap_locks);
}
- if (need_flush)
- flush_tlb_range(vma, old_end-len, old_addr);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+ tlb_finish_mmu(&tlb, mmun_start, mmun_end);
return len + old_addr - old_end; /* how much done */
}
--
2.5.5
On Mon, Nov 28, 2016 at 12:40 AM, Aaron Lu <[email protected]> wrote:
> As suggested by Linus, the same mmu_gather logic can be used for the TLB
> flush in mremap, and this patch does that.
Ok, looking at this patch, I still think it looks like the right thing
to do, but I'm admittedly rather less certain of it.
The main advantage of the mmu_gather thing is that it automatically
takes care of the TLB flush ranges for us, and that's a big deal
during munmap() (where the actual unmapped page range can be _very_
different from the total range). But now that I notice that this
doesn't actually remove any other code (in fact, it adds a line), I'm
wondering if it's worth it. mremap() is already "dense" in the vma
space, unlike munmap (i.e. you can't move multiple vmas with a single
mremap), so the fancy range optimizations that make a difference on
some architectures aren't much of an issue.
So I guess the MM people should take a look at this and say whether
they think the current state is fine or whether we should do the
mmu_gather thing. People?
However, I also independently think I found an actual bug while
looking at the code as part of looking at the patch.
This part looks racy:
/*
* We are remapping a dirty PTE, make sure to
* flush TLB before we drop the PTL for the
* old PTE or we may race with page_mkclean().
*/
if (pte_present(*old_pte) && pte_dirty(*old_pte))
force_flush = true;
pte = ptep_get_and_clear(mm, old_addr, old_pte);
where the issue is that another thread might make the pte be dirty (in
the hardware walker, so no locking of ours makes any difference)
*after* we checked whether it was dirty, but *before* we removed it
from the page tables.
So I think the "check for force-flush" needs to come *after*, and we should do
pte = ptep_get_and_clear(mm, old_addr, old_pte);
if (pte_present(pte) && pte_dirty(pte))
force_flush = true;
instead.
This happens for the pmd case too.
So now I'm not sure the mmu_gather thing is worth it, but I'm pretty
sure that there remains a (very very) small race that wasn't fixed by
the original fix in commit 5d1904204c99 ("mremap: fix race between
mremap() and page cleanning").
Aaron, sorry for waffling about this, and asking you to look at a
completely different issue instead.
Linus
On 11/28/2016 12:40 AM, Aaron Lu wrote:
> As suggested by Linus, the same mmu_gather logic can be used for the TLB
> flush in mremap, and this patch does that.
>
> Note that since no page is added to "struct mmu_gather" for freeing
> during mremap, either tlb_flush_mmu or tlb_flush_mmu_tlbonly can be used
> when the TLB needs to be flushed. Using tlb_flush_mmu would also avoid
> exporting tlb_flush_mmu_tlbonly, but tlb_flush_mmu_tlbonly *looks*
> clearer and more straightforward, so I ended up using it.
OK, so the code before this patch was passing around a pointer to
'need_flush', and we basically just pass around an mmu_gather instead.
It doesn't really simplify the code _too_ much, although it does make it
less confusing than when we saw 'need_flush' mixed with 'force_flush' in
the code.
tlb_flush_mmu_tlbonly() has exactly one other use: zap_pte_range() for
flushing the TLB before pte_unmap_unlock():
if (force_flush)
tlb_flush_mmu_tlbonly(tlb);
pte_unmap_unlock(start_pte, ptl);
But, both call-sites are still keeping 'force_flush' to store the
information about whether we ever saw a dirty pte. If we moved _that_
logic into the x86 mmu_gather code, we could get rid of all the
'force_flush' tracking in both call sites. It also makes us a bit more
future-proof against these page_mkclean() races if we ever grow a third
site for clearing ptes.
Instead of exporting and calling tlb_flush_mmu_tlbonly(), we'd need
something like tlb_flush_mmu_before_ptl_release() (but with a better
name, of course :).
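Roughly something like the sketch below, maybe. The helper names and the
new mmu_gather field are made up here purely to illustrate the idea;
nothing like this exists today:

	/*
	 * Hypothetical helpers: remember inside the mmu_gather whether a
	 * cleared entry was dirty, instead of in a caller-side 'force_flush'.
	 * 'saw_dirty' would be a new struct mmu_gather field.
	 */
	static inline void tlb_note_cleared_pte(struct mmu_gather *tlb, pte_t pte)
	{
		if (pte_present(pte) && pte_dirty(pte))
			tlb->saw_dirty = true;
	}

	static inline void tlb_flush_mmu_before_ptl_release(struct mmu_gather *tlb)
	{
		if (tlb->saw_dirty) {
			tlb_flush_mmu_tlbonly(tlb);
			tlb->saw_dirty = false;
		}
	}

Callers would then do tlb_note_cleared_pte(tlb, ptep_get_and_clear(...))
and call tlb_flush_mmu_before_ptl_release(tlb) right before
pte_unmap_unlock(), with no force_flush tracking of their own.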
On Mon, Nov 28, 2016 at 9:32 AM, Dave Hansen <[email protected]> wrote:
>
> But, both call-sites are still keeping 'force_flush' to store the
> information about whether we ever saw a dirty pte. If we moved _that_
> logic into the x86 mmu_gather code, we could get rid of all the
> 'force_flush' tracking in both call sites. It also makes us a bit more
> future-proof against these page_mkclean() races if we ever grow a third
> site for clearing ptes.
Yeah, that sounds like a nice cleanup and would put all the real state
into that mmu_gather structure.
Linus
On 11/29/2016 01:15 AM, Linus Torvalds wrote:
> However, I also independently think I found an actual bug while
> looking at the code as part of looking at the patch.
>
> This part looks racy:
>
> /*
> * We are remapping a dirty PTE, make sure to
> * flush TLB before we drop the PTL for the
> * old PTE or we may race with page_mkclean().
> */
> if (pte_present(*old_pte) && pte_dirty(*old_pte))
> force_flush = true;
> pte = ptep_get_and_clear(mm, old_addr, old_pte);
>
> where the issue is that another thread might make the pte be dirty (in
> the hardware walker, so no locking of ours makes any difference)
> *after* we checked whether it was dirty, but *before* we removed it
> from the page tables.
Ah, very right. Thanks for the catch!
>
> So I think the "check for force-flush" needs to come *after*, and we should do
>
> pte = ptep_get_and_clear(mm, old_addr, old_pte);
> if (pte_present(pte) && pte_dirty(pte))
> force_flush = true;
>
> instead.
>
> This happens for the pmd case too.
Here is a fix patch, sorry for the trouble.
From c0dc52fd3d3be93afb5b97804937a1b1b7ef136e Mon Sep 17 00:00:00 2001
From: Aaron Lu <[email protected]>
Date: Tue, 29 Nov 2016 10:33:37 +0800
Subject: [PATCH] mremap: move_ptes: check pte dirty after its removal
Linus found that there is still a race in mremap after commit 5d1904204c99
("mremap: fix race between mremap() and page cleanning").
As described by Linus:
the issue is that another thread might make the pte be dirty (in
the hardware walker, so no locking of ours makes any difference)
*after* we checked whether it was dirty, but *before* we removed it
from the page tables.
Fix it by doing the check after the PTE has been removed from the page table.
Suggested-by: Linus Torvalds <[email protected]>
Signed-off-by: Aaron Lu <[email protected]>
---
mm/huge_memory.c | 2 +-
mm/mremap.c | 6 +++++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index eff3de359d50..a3e466c489a9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1456,9 +1456,9 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+ pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
if (pmd_present(*old_pmd) && pmd_dirty(*old_pmd))
force_flush = true;
- pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
diff --git a/mm/mremap.c b/mm/mremap.c
index 6ccecc03f56a..4b39dd0974e5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -149,14 +149,18 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (pte_none(*old_pte))
continue;
+ pte = ptep_get_and_clear(mm, old_addr, old_pte);
/*
* We are remapping a dirty PTE, make sure to
* flush TLB before we drop the PTL for the
* old PTE or we may race with page_mkclean().
+ *
+ * This check has to be done after we removed the
+ * old PTE from page tables or another thread may
+ * dirty it after the check and before the removal.
*/
if (pte_present(*old_pte) && pte_dirty(*old_pte))
force_flush = true;
- pte = ptep_get_and_clear(mm, old_addr, old_pte);
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
set_pte_at(mm, new_addr, new_pte, pte);
--
2.5.5
On Mon, Nov 28, 2016 at 6:57 PM, Aaron Lu <[email protected]> wrote:
>
> Here is a fix patch, sorry for the trouble.
I don't think you tested this one.. You've now essentially reverted
5d1904204c99 entirely by making the new force_flush logic a no-op.
> + pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
> if (pmd_present(*old_pmd) && pmd_dirty(*old_pmd))
> force_flush = true;
You need to be testing "pmd", not "*old_pmd".
Because now "*old_pmd" will be zeroes.
> if (pte_present(*old_pte) && pte_dirty(*old_pte))
> force_flush = true;
Similarly here. You need to check "pte", not "*old_pte".
Linus
On Mon, Nov 28, 2016 at 07:06:39PM -0800, Linus Torvalds wrote:
> On Mon, Nov 28, 2016 at 6:57 PM, Aaron Lu <[email protected]> wrote:
> >
> > Here is a fix patch, sorry for the trouble.
>
> I don't think you tested this one.. You've now essentially reverted
> 5d1904204c99 entirely by making the new force_flush logic a no-op.
Right, I just did a build test.
Now I'm doing more tests, sorry for being careless.
Regards,
Aaron
>
> > + pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
> > if (pmd_present(*old_pmd) && pmd_dirty(*old_pmd))
> > force_flush = true;
>
> You need to be testing "pmd", not "*old_pmd".
>
> Because now "*old_pmd" will be zeroes.
>
> > if (pte_present(*old_pte) && pte_dirty(*old_pte))
> > force_flush = true;
>
> Similarly here. You need to check "pte", not "*old_pte".
>
> Linus
Linus found that there is still a race in mremap after commit 5d1904204c99
("mremap: fix race between mremap() and page cleanning").
As described by Linus:
the issue is that another thread might make the pte be dirty (in
the hardware walker, so no locking of ours makes any difference)
*after* we checked whether it was dirty, but *before* we removed it
from the page tables.
Fix it by doing the check after the PTE has been removed from the page table.
Suggested-by: Linus Torvalds <[email protected]>
Signed-off-by: Aaron Lu <[email protected]>
---
mm/huge_memory.c | 4 ++--
mm/mremap.c | 8 ++++++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index eff3de359d50..d4a6e4001512 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1456,9 +1456,9 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
- if (pmd_present(*old_pmd) && pmd_dirty(*old_pmd))
- force_flush = true;
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
+ if (pmd_present(pmd) && pmd_dirty(pmd))
+ force_flush = true;
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
diff --git a/mm/mremap.c b/mm/mremap.c
index 6ccecc03f56a..53df7ec8d2ba 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -149,14 +149,18 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (pte_none(*old_pte))
continue;
+ pte = ptep_get_and_clear(mm, old_addr, old_pte);
/*
* We are remapping a dirty PTE, make sure to
* flush TLB before we drop the PTL for the
* old PTE or we may race with page_mkclean().
+ *
+ * This check has to be done after we removed the
+ * old PTE from page tables or another thread may
+ * dirty it after the check and before the removal.
*/
- if (pte_present(*old_pte) && pte_dirty(*old_pte))
+ if (pte_present(pte) && pte_dirty(pte))
force_flush = true;
- pte = ptep_get_and_clear(mm, old_addr, old_pte);
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
set_pte_at(mm, new_addr, new_pte, pte);
--
2.5.5