Commit 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 (mm: wrap calls to
set_pte_at_notify with invalidate_range_start and invalidate_range_end)
breaks the semantics of set_pte_at_notify. When calls to set_pte_at_notify
are wrapped with mmu_notifier_invalidate_range_start and
mmu_notifier_invalidate_range_end, KVM zaps the pte during the
mmu_notifier_invalidate_range_start callback, so set_pte_at_notify has
no spte to update and is therefore called for nothing.
As Andrea suggested (1), the problem is resolved by calling
mmu_notifier_invalidate_page after the PT lock has been released, and only
for mmu_notifiers that do not implement the change_pte callback.
(1) http://thread.gmane.org/gmane.linux.kernel.mm/111710/focus=111711
Reported-by: Izik Eidus <[email protected]>
Signed-off-by: Mike Rapoport <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Haggai Eran <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
include/linux/mmu_notifier.h | 31 ++++++++++++++++++++++++++-----
kernel/events/uprobes.c | 12 ++++++------
mm/ksm.c | 15 +++++----------
mm/memory.c | 14 +++++---------
mm/mmu_notifier.c | 24 ++++++++++++++++++++++--
5 files changed, 64 insertions(+), 32 deletions(-)
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index deca874..46c96cb 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -176,14 +176,17 @@ extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long address);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address);
-extern void __mmu_notifier_change_pte(struct mm_struct *mm,
- unsigned long address, pte_t pte);
+extern int __mmu_notifier_change_pte(struct mm_struct *mm,
+ unsigned long address, pte_t pte);
extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
unsigned long address);
extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
unsigned long start, unsigned long end);
extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
unsigned long start, unsigned long end);
+extern void
+__mmu_notifier_invalidate_page_if_missing_change_pte(struct mm_struct *mm,
+ unsigned long address);
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -207,11 +210,12 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
return 0;
}
-static inline void mmu_notifier_change_pte(struct mm_struct *mm,
+static inline int mmu_notifier_change_pte(struct mm_struct *mm,
unsigned long address, pte_t pte)
{
if (mm_has_notifiers(mm))
- __mmu_notifier_change_pte(mm, address, pte);
+ return __mmu_notifier_change_pte(mm, address, pte);
+ return 0;
}
static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -235,6 +239,15 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
__mmu_notifier_invalidate_range_end(mm, start, end);
}
+static inline
+void mmu_notifier_invalidate_page_if_missing_change_pte(struct mm_struct *mm,
+ unsigned long address)
+{
+ if (mm_has_notifiers(mm))
+ __mmu_notifier_invalidate_page_if_missing_change_pte(mm,
+ address);
+}
+
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
mm->mmu_notifier_mm = NULL;
@@ -283,9 +296,11 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
struct mm_struct *___mm = __mm; \
unsigned long ___address = __address; \
pte_t ___pte = __pte; \
+ int ___ret; \
\
- mmu_notifier_change_pte(___mm, ___address, ___pte); \
+ ___ret = mmu_notifier_change_pte(___mm, ___address, ___pte); \
set_pte_at(___mm, ___address, __ptep, ___pte); \
+ ___ret; \
})
#else /* CONFIG_MMU_NOTIFIER */
@@ -326,6 +341,12 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
{
}
+static inline
+void mmu_notifier_invalidate_page_if_missing_change_pte(struct mm_struct *mm,
+ unsigned long address)
+{
+}
+
static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 24b7d6c..ec49338 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -131,14 +131,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
spinlock_t *ptl;
pte_t *ptep;
int err;
- /* For mmu_notifiers */
- const unsigned long mmun_start = addr;
- const unsigned long mmun_end = addr + PAGE_SIZE;
+ int notify_missing = 0;
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(page);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
@@ -154,20 +151,23 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
+ notify_missing = set_pte_at_notify(mm, addr, ptep,
+ mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
if (!page_mapped(page))
try_to_free_swap(page);
pte_unmap_unlock(ptep, ptl);
+ if (notify_missing)
+ mmu_notifier_invalidate_page_if_missing_change_pte(mm, addr);
+
if (vma->vm_flags & VM_LOCKED)
munlock_vma_page(page);
put_page(page);
err = 0;
unlock:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 175fff7..42e8254 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -861,8 +861,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
spinlock_t *ptl;
int swapped;
int err = -EFAULT;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ int notify_missing = 0;
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
@@ -870,13 +869,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
- mmun_start = addr;
- mmun_end = addr + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
- goto out_mn;
+ goto out;
if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
@@ -904,15 +899,15 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (pte_dirty(entry))
set_page_dirty(page);
entry = pte_mkclean(pte_wrprotect(entry));
- set_pte_at_notify(mm, addr, ptep, entry);
+ notify_missing = set_pte_at_notify(mm, addr, ptep, entry);
}
*orig_pte = *ptep;
err = 0;
out_unlock:
pte_unmap_unlock(ptep, ptl);
-out_mn:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ if (notify_missing)
+ mmu_notifier_invalidate_page_if_missing_change_pte(mm, addr);
out:
return err;
}
diff --git a/mm/memory.c b/mm/memory.c
index 6768ce9..596d4c3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2611,8 +2611,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
int ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
- unsigned long mmun_start = 0; /* For mmu_notifiers */
- unsigned long mmun_end = 0; /* For mmu_notifiers */
+ int notify_missing = 0;
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
@@ -2798,10 +2797,6 @@ gotten:
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
- mmun_start = address & PAGE_MASK;
- mmun_end = mmun_start + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
/*
* Re-check the pte - we dropped the lock
*/
@@ -2830,7 +2825,8 @@ gotten:
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
- set_pte_at_notify(mm, address, page_table, entry);
+ notify_missing = set_pte_at_notify(mm, address, page_table,
+ entry);
update_mmu_cache(vma, address, page_table);
if (old_page) {
/*
@@ -2868,8 +2864,8 @@ gotten:
page_cache_release(new_page);
unlock:
pte_unmap_unlock(page_table, ptl);
- if (mmun_end > mmun_start)
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ if (notify_missing)
+ mmu_notifier_invalidate_page_if_missing_change_pte(mm, address);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 93e6089..5fc5bc2 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -122,18 +122,23 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
return young;
}
-void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
- pte_t pte)
+int __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
+ pte_t pte)
{
struct mmu_notifier *mn;
int id;
+ int ret = 0;
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->change_pte)
mn->ops->change_pte(mn, mm, address, pte);
+ else
+ ret = 1;
}
srcu_read_unlock(&srcu, id);
+
+ return ret;
}
void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -180,6 +185,21 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
+void __mmu_notifier_invalidate_page_if_missing_change_pte(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ int id;
+
+ id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->invalidate_page && !mn->ops->change_pte)
+ mn->ops->invalidate_page(mn, mm, address);
+ }
+ srcu_read_unlock(&srcu, id);
+}
+EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_page_if_missing_change_pte);
+
static int do_mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm,
int take_mmap_sem)
--
1.8.1.5
On Wed, Jan 15, 2014 at 11:40:34AM +0200, Mike Rapoport wrote:
> Commit 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 (mm: wrap calls to
> set_pte_at_notify with invalidate_range_start and invalidate_range_end)
> breaks semantics of set_pte_at_notify. When calls to set_pte_at_notify
> are wrapped with mmu_notifier_invalidate_range_start and
> mmu_notifier_invalidate_range_end, KVM zaps pte during
> mmu_notifier_invalidate_range_start callback and set_pte_at_notify has
> no spte to update and therefore it's called for nothing.
>
> As Andrea suggested (1), the problem is resolved by calling
> mmu_notifier_invalidate_page after PT lock has been released and only
> for mmu_notifiers that do not implement the change_pte callback.
>
> (1) http://thread.gmane.org/gmane.linux.kernel.mm/111710/focus=111711
>
> Reported-by: Izik Eidus <[email protected]>
> Signed-off-by: Mike Rapoport <[email protected]>
> Cc: Andrea Arcangeli <[email protected]>
> Cc: Haggai Eran <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> ---
> include/linux/mmu_notifier.h | 31 ++++++++++++++++++++++++++-----
> kernel/events/uprobes.c | 12 ++++++------
> mm/ksm.c | 15 +++++----------
> mm/memory.c | 14 +++++---------
> mm/mmu_notifier.c | 24 ++++++++++++++++++++++--
> 5 files changed, 64 insertions(+), 32 deletions(-)
Reviewed-by: Andrea Arcangeli <[email protected]>
Thanks!
Andrea
On 22/01/2014 15:10, Andrea Arcangeli wrote:
> On Wed, Jan 15, 2014 at 11:40:34AM +0200, Mike Rapoport wrote:
>> Commit 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 (mm: wrap calls to
>> set_pte_at_notify with invalidate_range_start and invalidate_range_end)
>> breaks semantics of set_pte_at_notify. When calls to set_pte_at_notify
>> are wrapped with mmu_notifier_invalidate_range_start and
>> mmu_notifier_invalidate_range_end, KVM zaps pte during
>> mmu_notifier_invalidate_range_start callback and set_pte_at_notify has
>> no spte to update and therefore it's called for nothing.
>>
>> As Andrea suggested (1), the problem is resolved by calling
>> mmu_notifier_invalidate_page after PT lock has been released and only
>> for mmu_notifiers that do not implement the change_pte callback.
>>
>> (1) http://thread.gmane.org/gmane.linux.kernel.mm/111710/focus=111711
>>
>> Reported-by: Izik Eidus <[email protected]>
>> Signed-off-by: Mike Rapoport <[email protected]>
>> Cc: Andrea Arcangeli <[email protected]>
>> Cc: Haggai Eran <[email protected]>
>> Cc: Peter Zijlstra <[email protected]>
>> ---
>> include/linux/mmu_notifier.h | 31 ++++++++++++++++++++++++++-----
>> kernel/events/uprobes.c | 12 ++++++------
>> mm/ksm.c | 15 +++++----------
>> mm/memory.c | 14 +++++---------
>> mm/mmu_notifier.c | 24 ++++++++++++++++++++++--
>> 5 files changed, 64 insertions(+), 32 deletions(-)
>
> Reviewed-by: Andrea Arcangeli <[email protected]>
>
Hi Andrea, Mike,
Did you get a chance to consider the scenario I wrote about in the other
thread?
I'm worried about the following scenario:
Given a read-only page, suppose one host thread (thread 1) writes to
that page, and performs COW, but before it calls the
mmu_notifier_invalidate_page_if_missing_change_pte function another host
thread (thread 2) writes to the same page (this time without a page
fault). Then we have a valid entry in the secondary page table to a
stale page, and someone (thread 3) may read stale data from there.
Here's a diagram that shows this scenario:
Thread 1 | Thread 2 | Thread 3
========================================================================
do_wp_page(page 1) | |
... | |
set_pte_at_notify | |
... | write to page 1 |
| | stale access
pte_unmap_unlock | |
invalidate_page_if_missing_change_pte | |
This is currently prevented by the use of the range start and range end
notifiers.
Do you agree that this scenario is possible with the new patch, or am I
missing something?
Regards,
Haggai
On Wed, 15 Jan 2014 11:40:34 +0200 Mike Rapoport <[email protected]> wrote:
> Commit 6bdb913f0a70a4dfb7f066fb15e2d6f960701d00 (mm: wrap calls to
> set_pte_at_notify with invalidate_range_start and invalidate_range_end)
> breaks semantics of set_pte_at_notify. When calls to set_pte_at_notify
> are wrapped with mmu_notifier_invalidate_range_start and
> mmu_notifier_invalidate_range_end, KVM zaps pte during
> mmu_notifier_invalidate_range_start callback and set_pte_at_notify has
> no spte to update and therefore it's called for nothing.
>
> As Andrea suggested (1), the problem is resolved by calling
> mmu_notifier_invalidate_page after PT lock has been released and only
> for mmu_notifiers that do not implement the change_pte callback.
>
> (1) http://thread.gmane.org/gmane.linux.kernel.mm/111710/focus=111711
The changelog fails to describe the end-user-visible effects of the
bug, so I (and others) will be unable to decide which kernel versions
need patching.
Given that the bug has been around for 1.5 years I assume the priority
is low.
The patch appears to assume that a single call to
->invalidate_range_start()/->invalidate_range_end() is equivalent to a
series of per-page calls to ->invalidate_page(), yes? I'd have
expected to see some discussion of this in the changelog.
I expect this change will make drivers/misc/sgi-gru/grutlbpurge.c run
slower, but I don't know how much. And GRU will now be missing its
wake_up_all() during ->invalidate_range_end() - I don't know what
effect that will have.
I didn't review any other affected callers. Please do this and
explain the expected impact. Please also cc the affected developers
and ensure that enough information is provided so they can understand
the effect the patch has upon their code.
Generally, the patch is really ugly :( We have a nice consistent and
symmetrical pattern of calling
->invalidate_range_start()/->invalidate_range_end() and this patch
comes along and tears great holes in it by removing those calls from a
subset of places and replacing them with open-coded calls to
single-page ->invalidate_page(). Isn't there some (much) nicer way of
doing all this?
> @@ -283,9 +296,11 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
> struct mm_struct *___mm = __mm; \
> unsigned long ___address = __address; \
> pte_t ___pte = __pte; \
> + int ___ret; \
> \
> - mmu_notifier_change_pte(___mm, ___address, ___pte); \
> + ___ret = mmu_notifier_change_pte(___mm, ___address, ___pte); \
> set_pte_at(___mm, ___address, __ptep, ___pte); \
> + ___ret; \
> })
set_pte_at_notify() used to be well-documented. Now it is missing
documentation for its return value.
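Something along these lines above the macro would do (the wording is only a
suggestion):

/*
 * set_pte_at_notify() sets the pte _after_ telling registered secondary
 * MMUs about the new page via the ->change_pte() method.  It returns
 * nonzero if at least one registered mmu notifier does not implement
 * ->change_pte(); in that case the caller must invalidate the old page
 * in those secondary MMUs once the PT lock has been dropped, e.g. with
 * mmu_notifier_invalidate_page_if_missing_change_pte().
 */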
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -131,14 +131,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
> spinlock_t *ptl;
> pte_t *ptep;
> int err;
> - /* For mmu_notifiers */
> - const unsigned long mmun_start = addr;
> - const unsigned long mmun_end = addr + PAGE_SIZE;
> + int notify_missing = 0;
Using a bool would be clearer.
The initialisation appears to be unnecessary.
> /* For try_to_free_swap() and munlock_vma_page() below */
> lock_page(page);
>
> - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> err = -EAGAIN;
> ptep = page_check_address(page, mm, addr, &ptl, 0);
> if (!ptep)
> @@ -154,20 +151,23 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
>
> flush_cache_page(vma, addr, pte_pfn(*ptep));
> ptep_clear_flush(vma, addr, ptep);
> - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
> + notify_missing = set_pte_at_notify(mm, addr, ptep,
> + mk_pte(kpage, vma->vm_page_prot));
>
> page_remove_rmap(page);
> if (!page_mapped(page))
> try_to_free_swap(page);
> pte_unmap_unlock(ptep, ptl);
>
> + if (notify_missing)
> + mmu_notifier_invalidate_page_if_missing_change_pte(mm, addr);
Some comments would be helpful here. At least to explain to the reader
that we're pulling tricks to avoid calling the notifiers under
pte_offset_map_lock().
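For example (wording again just a suggestion):

	/*
	 * The pte was replaced under the PT lock and notifiers that
	 * implement ->change_pte() have already been updated.  Notifiers
	 * without ->change_pte() still map the old page, and since their
	 * ->invalidate_page() implementations may need to sleep we must
	 * call them here, after pte_unmap_unlock(), not under the PT lock.
	 */
	if (notify_missing)
		mmu_notifier_invalidate_page_if_missing_change_pte(mm, addr);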
> if (vma->vm_flags & VM_LOCKED)
> munlock_vma_page(page);
> put_page(page);
>
> err = 0;
> unlock:
> - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> unlock_page(page);
> return err;
> }
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 175fff7..42e8254 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -861,8 +861,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
> spinlock_t *ptl;
> int swapped;
> int err = -EFAULT;
> - unsigned long mmun_start; /* For mmu_notifiers */
> - unsigned long mmun_end; /* For mmu_notifiers */
> + int notify_missing = 0;
bool?
Here the initialisation appears to be needed.
> addr = page_address_in_vma(page, vma);
> if (addr == -EFAULT)
> @@ -870,13 +869,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
>
> BUG_ON(PageTransCompound(page));
>
> - mmun_start = addr;
> - mmun_end = addr + PAGE_SIZE;
> - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> -
> ptep = page_check_address(page, mm, addr, &ptl, 0);
> if (!ptep)
> - goto out_mn;
> + goto out;
>
> if (pte_write(*ptep) || pte_dirty(*ptep)) {
> pte_t entry;
> @@ -904,15 +899,15 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
> if (pte_dirty(entry))
> set_page_dirty(page);
> entry = pte_mkclean(pte_wrprotect(entry));
> - set_pte_at_notify(mm, addr, ptep, entry);
> + notify_missing = set_pte_at_notify(mm, addr, ptep, entry);
> }
> *orig_pte = *ptep;
> err = 0;
>
> out_unlock:
> pte_unmap_unlock(ptep, ptl);
> -out_mn:
> - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> + if (notify_missing)
> + mmu_notifier_invalidate_page_if_missing_change_pte(mm, addr);
> out:
> return err;
> }
> diff --git a/mm/memory.c b/mm/memory.c
> index 6768ce9..596d4c3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2611,8 +2611,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
> int ret = 0;
> int page_mkwrite = 0;
> struct page *dirty_page = NULL;
> - unsigned long mmun_start = 0; /* For mmu_notifiers */
> - unsigned long mmun_end = 0; /* For mmu_notifiers */
> + int notify_missing = 0;
dittoes.
> old_page = vm_normal_page(vma, address, orig_pte);
> if (!old_page) {
> @@ -2798,10 +2797,6 @@ gotten:
> if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
> goto oom_free_new;
>
> - mmun_start = address & PAGE_MASK;
> - mmun_end = mmun_start + PAGE_SIZE;
> - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> -
> /*
> * Re-check the pte - we dropped the lock
> */
> @@ -2830,7 +2825,8 @@ gotten:
> * mmu page tables (such as kvm shadow page tables), we want the
> * new page to be mapped directly into the secondary page table.
> */
> - set_pte_at_notify(mm, address, page_table, entry);
> + notify_missing = set_pte_at_notify(mm, address, page_table,
> + entry);
> update_mmu_cache(vma, address, page_table);
> if (old_page) {
> /*
> @@ -2868,8 +2864,8 @@ gotten:
> page_cache_release(new_page);
> unlock:
> pte_unmap_unlock(page_table, ptl);
> - if (mmun_end > mmun_start)
> - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> + if (notify_missing)
> + mmu_notifier_invalidate_page_if_missing_change_pte(mm, address);
> if (old_page) {
> /*
> * Don't let another task, with possibly unlocked vma,
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> index 93e6089..5fc5bc2 100644
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -122,18 +122,23 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
> return young;
> }
>
> -void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
> - pte_t pte)
> +int __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
> + pte_t pte)
return bool?
Document the return value (at least).
> {
> struct mmu_notifier *mn;
> int id;
> + int ret = 0;
>
> id = srcu_read_lock(&srcu);
> hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> if (mn->ops->change_pte)
> mn->ops->change_pte(mn, mm, address, pte);
> + else
> + ret = 1;
> }
> srcu_read_unlock(&srcu, id);
> +
> + return ret;
> }
>
> void __mmu_notifier_invalidate_page(struct mm_struct *mm,
> @@ -180,6 +185,21 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
> }
> EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
>
> +void __mmu_notifier_invalidate_page_if_missing_change_pte(struct mm_struct *mm,
> + unsigned long address)
> +{
> + struct mmu_notifier *mn;
> + int id;
> +
> + id = srcu_read_lock(&srcu);
> + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> + if (mn->ops->invalidate_page && !mn->ops->change_pte)
> + mn->ops->invalidate_page(mn, mm, address);
> + }
> + srcu_read_unlock(&srcu, id);
> +}
> +EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_page_if_missing_change_pte);
Definitely needs thorough documenting, please.
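At a minimum a kernel-doc comment along these lines (wording only a
suggestion):

/**
 * __mmu_notifier_invalidate_page_if_missing_change_pte - flush one page
 * from secondary MMUs whose notifier does not implement ->change_pte()
 * @mm: the mm the page belongs to
 * @address: virtual address of the page to invalidate
 *
 * Called after set_pte_at_notify() returned nonzero and after the PT lock
 * has been dropped.  Notifiers that implement ->change_pte() were already
 * pointed at the new page; the remaining ones still map the old page and
 * get an ->invalidate_page() call instead.
 */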
On Wed, Jan 22, 2014 at 01:54:59PM -0800, Andrew Morton wrote:
> The changelog fails to describe the end-user visible effects of the
> bug, so I (and others) will be unable to decide which kernel versions
> need patching
>
> Given that the bug has been around for 1.5 years I assume the priority
> is low.
The priority is low; it's about a performance optimization only.
change_pte avoids a vmexit when the guest first accesses the page after
a KSM COW break, or after a KSM merge.
But the change_pte method became worthless: it was still called, but it
did nothing with the current common code in memory.c and ksm.c.
In the old days KVM would call gup_fast(write=1). These days write=1
is no longer always forced on, and a secondary MMU read fault calls
gup_fast with write=0. So in the old days, without a fully functional
change_pte invocation, KSM-merged pages could never be read from the
guest without first breaking them with a COW, so it would have been a
showstopper if change_pte didn't work.
These days the KVM secondary MMU page fault handler has become more
advanced and change_pte is just a vmexit optimization.
> Generally, the patch is really ugly :( We have a nice consistent and
It would get even uglier once we fix the problem Haggai pointed out
in another email on this thread, by keeping the pte wrprotected until
we call the mmu_notifier invalidate, in short doing one more TLB
flush to convert it to a writable pte if the change_pte method isn't
implemented by some registered mmu notifier for the mm.
That problem isn't the end of the world and is fixable, but the
do_wp_page code gets even more hairy after fixing it with this
approach.
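Roughly, the tail of do_wp_page() would have to turn into something like
this (pseudocode only, to show where the extra TLB flush comes from):

	entry = pte_wrprotect(mk_pte(new_page, vma->vm_page_prot));
	notify_missing = set_pte_at_notify(mm, address, page_table, entry);
	pte_unmap_unlock(page_table, ptl);

	/* no thread can write through the primary pte yet */
	if (notify_missing)
		mmu_notifier_invalidate_page_if_missing_change_pte(mm, address);

	/* second trip through the PT lock, plus one more TLB flush,
	 * to finally make the pte writable */
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (likely(pte_same(*page_table, entry))) {
		entry = pte_mkwrite(pte_mkdirty(entry));
		ptep_set_access_flags(vma, address, page_table, entry, 1);
		update_mmu_cache(vma, address, page_table);
	}
	pte_unmap_unlock(page_table, ptl);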
> symmetrical pattern of calling
> ->invalidate_range_start()/->invalidate_range_end() and this patch
> comes along and tears great holes in it by removing those calls from a
> subset of places and replacing them with open-coded calls to
> single-page ->invalidate_page(). Isn't there some (much) nicer way of
> doing all this?
The fundamental problem is that change_pte acts only on established
secondary MMU mappings. So if we tear down the secondary mmu mappings
with invalidate_range_start, we might as well skip calling change_pte in
set_pte_at_notify and just use set_pte_at instead.
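To make it concrete, with the range calls wrapped around it the sequence
degenerates to (pseudocode):

	mmu_notifier_invalidate_range_start(mm, start, end); /* KVM zaps the spte */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, entry); /* ->change_pte() finds no spte,
						   * so this is just set_pte_at() */
	mmu_notifier_invalidate_range_end(mm, start, end);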
Something must be done about it because the current code just doesn't
make sense to keep as is.
Possible choices:
1) we drop change_pte completely (we have not fully evaluated the impact
vs current production code where change_pte was in full effect and
skipped some vmexits with KSM activity). It won't be slower than
current upstream; it may be slower than current production code
with KSM in use. We need to benchmark it to see if it's
measurable...
2) we fix it with this patch, plus adding a further step to keep the
pte wrprotected until we flush the secondary mmu mapping.
3) we change the semantics of ->change_pte not to just change, but to
establish non-existent secondary mmu mappings. So far the only way
to establish secondary mmu mappings would have been outside of the
mmu notifier, through regular secondary MMU page faults invoking
gup_fast or one of the gup variants. That may be tricky, as
change_pte must be non-blocking, so it cannot reliably allocate
memory.
Comments welcome,
Andrea
On 03/30/2014 11:33 PM, Jerome Glisse wrote:
> On Wed, Jan 22, 2014 at 04:01:15PM +0200, Haggai Eran wrote:
>> I'm worried about the following scenario:
>>
>> Given a read-only page, suppose one host thread (thread 1) writes to
>> that page, and performs COW, but before it calls the
>> mmu_notifier_invalidate_page_if_missing_change_pte function another host
>> thread (thread 2) writes to the same page (this time without a page
>> fault). Then we have a valid entry in the secondary page table to a
>> stale page, and someone (thread 3) may read stale data from there.
>>
>> Here's a diagram that shows this scenario:
>>
>> Thread 1 | Thread 2 | Thread 3
>> ========================================================================
>> do_wp_page(page 1) | |
>> ... | |
>> set_pte_at_notify | |
>> ... | write to page 1 |
>> | | stale access
>> pte_unmap_unlock | |
>> invalidate_page_if_missing_change_pte | |
>>
>> This is currently prevented by the use of the range start and range end
>> notifiers.
>>
>> Do you agree that this scenario is possible with the new patch, or am I
>> missing something?
>>
> I believe you are right, but of all the upstream user of the mmu_notifier
> API only xen would suffer from this ie any user that do not have a proper
> change_pte callback can see the bogus scenario you describe above.
Yes. I sent our RDMA paging RFC patch-set on linux-rdma [1] last month,
and it would also suffer from this scenario, but it's not upstream yet.
> The issue i see is with user that want to/or might sleep when they are
> invalidation the secondary page table. The issue being that change_pte is
> call with the cpu page table locked (well at least for the affected pmd).
>
> I would rather keep the invalidate_range_start/end bracket around change_pte
> and invalidate page. I think we can fix the kvm regression by other means.
Perhaps another possibility would be to do the
invalidate_range_start/end bracket only when the mmu_notifier is missing
a change_pte implementation.
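A rough sketch of what I mean (the helper below is hypothetical and would
have to live in mm/mmu_notifier.c, next to the srcu struct):

bool mm_has_notifier_without_change_pte(struct mm_struct *mm)
{
	struct mmu_notifier *mn;
	bool missing = false;
	int id;

	if (!mm_has_notifiers(mm))
		return false;

	id = srcu_read_lock(&srcu);
	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
		if (!mn->ops->change_pte) {
			missing = true;
			break;
		}
	}
	srcu_read_unlock(&srcu, id);

	return missing;
}

do_wp_page(), write_protect_page() and __replace_page() would then emit the
invalidate_range_start/end pair only when this returns true, so
set_pte_at_notify() stays effective for notifiers like KVM's.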
Best regards,
Haggai
[1] http://www.spinics.net/lists/linux-rdma/msg18906.html
On Wed, Apr 02, 2014 at 03:52:45PM +0300, Haggai Eran wrote:
> On 03/30/2014 11:33 PM, Jerome Glisse wrote:
> >On Wed, Jan 22, 2014 at 04:01:15PM +0200, Haggai Eran wrote:
> >>I'm worried about the following scenario:
> >>
> >>Given a read-only page, suppose one host thread (thread 1) writes to
> >>that page, and performs COW, but before it calls the
> >>mmu_notifier_invalidate_page_if_missing_change_pte function another host
> >>thread (thread 2) writes to the same page (this time without a page
> >>fault). Then we have a valid entry in the secondary page table to a
> >>stale page, and someone (thread 3) may read stale data from there.
> >>
> >>Here's a diagram that shows this scenario:
> >>
> >>Thread 1 | Thread 2 | Thread 3
> >>========================================================================
> >>do_wp_page(page 1) | |
> >> ... | |
> >> set_pte_at_notify | |
> >> ... | write to page 1 |
> >> | | stale access
> >> pte_unmap_unlock | |
> >> invalidate_page_if_missing_change_pte | |
> >>
> >>This is currently prevented by the use of the range start and range end
> >>notifiers.
> >>
> >>Do you agree that this scenario is possible with the new patch, or am I
> >>missing something?
> >>
> >I believe you are right, but of all the upstream user of the mmu_notifier
> >API only xen would suffer from this ie any user that do not have a proper
> >change_pte callback can see the bogus scenario you describe above.
> Yes. I sent our RDMA paging RFC patch-set on linux-rdma [1] last
> month, and it would also suffer from this scenario, but it's not
> upstream yet.
> >The issue i see is with user that want to/or might sleep when they are
> >invalidation the secondary page table. The issue being that change_pte is
> >call with the cpu page table locked (well at least for the affected pmd).
> >
> >I would rather keep the invalidate_range_start/end bracket around change_pte
> >and invalidate page. I think we can fix the kvm regression by other means.
> Perhaps another possibility would be to do the
> invalidate_range_start/end bracket only when the mmu_notifier is
> missing a change_pte implementation.
This would imply either scanning all the mmu_notifiers currently registered
or having a global flag in the mm to know if there is one mmu_notifier
without change_pte. Moreover this would mean that kvm would remain
"broken" if one of the mmu notifiers does not have the change_pte callback.
The solution I have in mind, which is part of a patchset I am working on,
just involves passing along an enum value to the mmu notifier callbacks.
The enum value would tell what exact event actually triggered the
mmu notifier call (vmscan, migrate, ksm, ...). Knowing this, kvm could then
simply ignore invalidate_range_start/end for events it knows it will get
a change_pte callback for.
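Roughly what I have in mind (the names below are illustrative only, not the
final interface):

enum mmu_event {
	MMU_MIGRATE,
	MMU_MUNMAP,
	MMU_VMSCAN,
	MMU_WRITE_PROTECT,	/* write_protect_page() in ksm.c */
	MMU_COW,		/* do_wp_page(), __replace_page() */
	/* ... */
};

struct mmu_notifier_ops {
	/* ... */
	/* the range callbacks grow an extra argument */
	void (*invalidate_range_start)(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start,
				       unsigned long end,
				       enum mmu_event event);
	/* ... */
};

/* kvm can then skip the teardown for events that are always followed
 * by a change_pte() on the same page */
static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end,
						    enum mmu_event event)
{
	if (event == MMU_COW || event == MMU_WRITE_PROTECT)
		return;		/* a change_pte() for this range will follow */

	/* otherwise zap the affected sptes as today */
}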
Cheers,
Jérôme
>
> Best regards,
> Haggai
>
> [1] http://www.spinics.net/lists/linux-rdma/msg18906.html
Hi,
On Wed, Apr 02, 2014 at 11:18:27AM -0400, Jerome Glisse wrote:
> This would imply either scanning all the mmu_notifiers currently registered
> or having a global flag in the mm to know if there is one mmu_notifier
> without change_pte. Moreover this would mean that kvm would remain
> "broken" if one of the mmu notifiers does not have the change_pte callback.
>
> The solution I have in mind, which is part of a patchset I am working on,
> just involves passing along an enum value to the mmu notifier callbacks.
> The enum value would tell what exact event actually triggered the
> mmu notifier call (vmscan, migrate, ksm, ...). Knowing this, kvm could then
> simply ignore invalidate_range_start/end for events it knows it will get
> a change_pte callback for.
That sounds similar to adding two new companion methods to change_pte
that, if not implemented, would fall back to
invalidate_range_start/end? And KVM would implement those as noops? It
would add bytes to the mmu notifier structure but it wouldn't add
branches.
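i.e. something like this (the two method names are made up here purely to
illustrate):

struct mmu_notifier_ops {
	/* ... existing methods ... */

	/*
	 * Optional companions to ->change_pte().  If a notifier leaves
	 * them NULL, core mm code falls back to the usual
	 * ->invalidate_range_start()/->invalidate_range_end() pair.
	 * KVM would implement them as noops, so the spte isn't zapped
	 * and ->change_pte() still finds it to update.
	 */
	void (*change_pte_start)(struct mmu_notifier *mn,
				 struct mm_struct *mm,
				 unsigned long start, unsigned long end);
	void (*change_pte_end)(struct mmu_notifier *mn,
			       struct mm_struct *mm,
			       unsigned long start, unsigned long end);
};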
It's not urgent but we clearly need to do something about this, or we
should drop change_pte entirely, because currently it does nothing and
only wastes some CPU cycles.
Removing change_pte these days isn't a showstopper because the KVM page
fault handler has become smarter lately. In the old days, lack of
change_pte would mean that the guest would break KSM COW if it ever
accessed the page in readonly (sharable) mode. Back then change_pte was
a fundamental optimization for using KSM and for avoiding all KSM pages
being COWed immediately after being merged.
These days reading from guest memory backed by KSM won't break COW
even if the spte isn't already established before the KVM fault fires
on the KSM memory. change_pte these days has only the benefit of
avoiding a vmexit/vmenter cycle after a KSM merge and one
vmexit/vmenter cycle after a KSM COW break (the event that triggers if
the guest eventually writes to the page).
KSM merges and KSM COWs aren't very frequent operations (and they both
have significant cost associated with them), so it's uncertain whether
it's worth keeping the change_pte optimization nowadays. Considering it's
already implemented, I personally feel it's worth keeping as a
micro-optimization, because a vmexit/vmenter cycle is certainly more
expensive than calling change_pte.
Both what you suggested above (with the enum or two new companion methods)
and the other way Haggai suggested (checking whether change_pte is
implemented in all registered mmu notifiers) sound good to me.
Thanks,
Andrea