Motivation
In the current kernel it is impossible to merge two anonymous VMAs
if one of them was moved. That is because a VMA's page offset is
set according to the virtual address where it was created, and two
VMAs can only merge if their page offsets line up.
Another obstacle when merging two VMAs is their anon_vma: in the
current kernel both anon_vmas have to be one and the same,
otherwise the merge is not allowed.
Missed merge opportunities increase the number of VMAs of a process
and in some cases can cause problems when the maximum map count
(the vm.max_map_count sysctl) is reached.
Solution
This patch series solves the first problem, the page offsets, by
updating them when a VMA is moved to a different virtual address
(patch 2). As for the second problem, merging of VMAs with
different anon_vmas is allowed (patch 3). Patch 1 refactors the
function vma_merge() to make it easier to understand and to allow
relatively seamless tracing of successful merges, which is
introduced by patch 4.
Limitations
For both problems the solution works only for VMAs that do not
share physical pages with other processes (usually child or parent
processes). This is checked by looking at the anon_vma of the
respective VMA. The reason it is not possible, or at least not
easy, to go further is that each physical page stores a pointer to
its anon_vma as well as its page offset, and when the page is
shared we cannot simply change these without affecting all of the
VMAs mapping it. The good news is that this case accounts for only
about 1-3% of all merges that fail in the current kernel (measured
for internet browsing and compilation use cases).
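For reference, the two per-page fields mentioned above look roughly
like this (a simplified sketch of struct page from
include/linux/mm_types.h; the real definition is a set of large
unions):

struct page {
	...
	struct address_space *mapping;	/* for anonymous pages: the
					 * anon_vma, tagged with the
					 * PAGE_MAPPING_ANON bit */
	pgoff_t index;			/* page offset into the mapping */
	...
};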
This series of patches and documentation of the related code will
be part of my master's thesis.
This patch series is based on tag v5.17-rc4.
Jakub Matěna (4):
mm: refactor of vma_merge()
mm: adjust page offset in mremap
mm: enable merging of VMAs with different anon_vmas
mm: add tracing for VMA merges
include/linux/rmap.h | 17 ++-
include/trace/events/mmap.h | 55 +++++++++
mm/internal.h | 11 ++
mm/mmap.c | 232 ++++++++++++++++++++++++++----------
mm/rmap.c | 40 +++++++
5 files changed, 290 insertions(+), 65 deletions(-)
--
2.34.1
Adjust the page offset of a VMA when it is moved to a new location
by mremap. This is made possible for all VMAs that do not share
their anonymous pages with other processes; previously it was
possible only for not-yet-faulted VMAs.
When the page offset does not correspond to the virtual address of
the anonymous VMA, any merge attempt with another VMA will fail.
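To make the failure mode concrete, here is a hypothetical userspace
demo (error handling omitted; illustrative only): two anonymous
mappings are faulted and one is then mremap()ed directly behind the
other. Without this series /proc/self/maps keeps showing two
separate VMAs, because the moved VMA keeps its old page offset.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char cmd[64];

	/* Reserve a window so the two mappings can end up adjacent. */
	char *base = mmap(NULL, 2 * pg, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	munmap(base + pg, pg);
	base[0] = 1;	/* fault the page: anon_vma and pgoff are now set */

	char *src = mmap(NULL, pg, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	src[0] = 1;	/* fault this one too */

	/* Move the second mapping directly behind the first. */
	mremap(src, pg, pg, MREMAP_MAYMOVE | MREMAP_FIXED, base + pg);

	/* Two VMAs without this series, one if the merge succeeds. */
	snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", (int)getpid());
	return system(cmd);
}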
Signed-off-by: Jakub Matěna <[email protected]>
---
mm/mmap.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 95 insertions(+), 6 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index b55e11f20571..8d253b46b349 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3224,6 +3224,91 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
return 0;
}
+bool rbst_no_children(struct anon_vma *av, struct rb_node *node)
+{
+ struct anon_vma_chain *model;
+ struct anon_vma_chain *avc;
+
+ if (node == NULL) /* leaf node */
+ return true;
+ avc = container_of(node, typeof(*(model)), rb);
+ if (avc->vma->anon_vma != av)
+ /*
+ * Inequality implies avc belongs
+ * to a VMA of a child process
+ */
+ return false;
+ return (rbst_no_children(av, node->rb_left) &&
+ rbst_no_children(av, node->rb_right));
+}
+
+/*
+ * Check if none of the VMAs connected to the given
+ * anon_vma via anon_vma_chain are in child relationship
+ */
+bool rbt_no_children(struct anon_vma *av)
+{
+ struct rb_node *root_node;
+
+ if (av == NULL || av->degree <= 1) /* Higher degree might not necessarily imply children */
+ return true;
+ root_node = av->rb_root.rb_root.rb_node;
+ return rbst_no_children(av, root_node);
+}
+
+/**
+ * update_faulted_pgoff() - Update faulted pages of a vma
+ * @vma: VMA being moved
+ * @addr: new virtual address
+ * @pgoff: pointer to pgoff which is updated
+ * If the vma and its pages are not shared with another process, update
+ * the new pgoff and also update index parameter (copy of the pgoff) in
+ * all faulted pages.
+ */
+bool update_faulted_pgoff(struct vm_area_struct *vma, unsigned long addr, pgoff_t *pgoff)
+{
+ unsigned long pg_iter = 0;
+ unsigned long pg_iters = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+
+ /* 1.] Check vma is not shared with other processes */
+ if (vma->anon_vma->root != vma->anon_vma || !rbt_no_children(vma->anon_vma))
+ return false;
+
+ /* 2.] Check all pages are not shared */
+ for (; pg_iter < pg_iters; ++pg_iter) {
+ bool pages_not_shared = true;
+ unsigned long shift = pg_iter << PAGE_SHIFT;
+ struct page *phys_page = follow_page(vma, vma->vm_start + shift, FOLL_GET);
+
+ if (phys_page == NULL)
+ continue;
+
+ /* Check page is not shared with other processes */
+ if (page_mapcount(phys_page) > 1)
+ pages_not_shared = false;
+ put_page(phys_page);
+ if (!pages_not_shared)
+ return false;
+ }
+
+ /* 3.] Update index in all pages to this new pgoff */
+ pg_iter = 0;
+ *pgoff = addr >> PAGE_SHIFT;
+
+ for (; pg_iter < pg_iters; ++pg_iter) {
+ unsigned long shift = pg_iter << PAGE_SHIFT;
+ struct page *phys_page = follow_page(vma, vma->vm_start + shift, FOLL_GET);
+
+ if (phys_page == NULL)
+ continue;
+ lock_page(phys_page);
+ phys_page->index = *pgoff + pg_iter;
+ unlock_page(phys_page);
+ put_page(phys_page);
+ }
+ return true;
+}
+
/*
* Copy the vma structure to a new location in the same mm,
* prior to moving page table entries, to effect an mremap move.
@@ -3237,15 +3322,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
struct rb_node **rb_link, *rb_parent;
- bool faulted_in_anon_vma = true;
+ bool anon_pgoff_updated = false;
/*
- * If anonymous vma has not yet been faulted, update new pgoff
+ * Try to update new pgoff for anonymous vma
* to match new location, to increase its chance of merging.
*/
- if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
- pgoff = addr >> PAGE_SHIFT;
- faulted_in_anon_vma = false;
+ if (unlikely(vma_is_anonymous(vma))) {
+ if (!vma->anon_vma) {
+ pgoff = addr >> PAGE_SHIFT;
+ anon_pgoff_updated = true;
+ } else {
+ anon_pgoff_updated = update_faulted_pgoff(vma, addr, &pgoff);
+ }
}
if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
@@ -3271,7 +3360,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
* safe. It is only safe to keep the vm_pgoff
* linear if there are no pages mapped yet.
*/
- VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
+ VM_BUG_ON_VMA(!anon_pgoff_updated, new_vma);
*vmap = vma = new_vma;
}
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
--
2.34.1
Enable merging of a VMA even when it is linked to a different
anon_vma than the one it is being merged into, but only if the VMA
in question does not share any page with a parent or child process.
Every anonymous page stores a pointer to its anon_vma in its
mapping field, which is now updated as part of the merge process.
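For context, an anonymous page has no separate anon_vma field: the
pointer lives in page->mapping with the low PAGE_MAPPING_ANON bit
set. Below is a simplified sketch of how it is decoded; the helper
name is hypothetical and it mirrors page_anon_vma() in v5.17 minus
the KSM/movable special cases. page_move_anon_rmap(), used by
reconnect_pages() below, rewrites exactly this field to point at
the merged-into VMA's anon_vma.

static struct anon_vma *anon_vma_of(struct page *page)
{
	/* PAGE_MAPPING_ANON is bit 0 of page->mapping */
	unsigned long mapping = (unsigned long)READ_ONCE(page->mapping);

	if (!(mapping & PAGE_MAPPING_ANON))
		return NULL;	/* file-backed page: no anon_vma */
	return (struct anon_vma *)(mapping & ~PAGE_MAPPING_ANON);
}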
Signed-off-by: Jakub Matěna <[email protected]>
---
include/linux/rmap.h | 17 ++++++++++++++++-
mm/mmap.c | 15 ++++++++++++++-
mm/rmap.c | 40 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 70 insertions(+), 2 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index e704b1a4c06c..c8508a4ebc46 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -137,10 +137,13 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
*/
void anon_vma_init(void); /* create anon_vma_cachep */
int __anon_vma_prepare(struct vm_area_struct *);
+void reconnect_pages(struct vm_area_struct *vma, struct vm_area_struct *next);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
+bool rbt_no_children(struct anon_vma *av);
+
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
if (likely(vma->anon_vma))
@@ -149,10 +152,22 @@ static inline int anon_vma_prepare(struct vm_area_struct *vma)
return __anon_vma_prepare(vma);
}
+/**
+ * anon_vma_merge() - Merge anon_vmas of the given VMAs
+ * @vma: VMA being merged to
+ * @next: VMA being merged
+ */
static inline void anon_vma_merge(struct vm_area_struct *vma,
struct vm_area_struct *next)
{
- VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
+ struct anon_vma *anon_vma1 = vma->anon_vma;
+ struct anon_vma *anon_vma2 = next->anon_vma;
+
+ VM_BUG_ON_VMA(anon_vma1 && anon_vma2 && anon_vma1 != anon_vma2 &&
+ ((anon_vma2 != anon_vma2->root)
+ || !rbt_no_children(anon_vma2)), vma);
+
+ reconnect_pages(vma, next);
unlink_anon_vmas(next);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 8d253b46b349..ed91d0cd2111 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1065,7 +1065,20 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
if ((!anon_vma1 || !anon_vma2) && (!vma ||
list_is_singular(&vma->anon_vma_chain)))
return 1;
- return anon_vma1 == anon_vma2;
+ if (anon_vma1 == anon_vma2)
+ return 1;
+ /*
+ * Different anon_vma but not shared by several processes
+ */
+ else if ((anon_vma1 && anon_vma2) &&
+ (anon_vma1 == anon_vma1->root)
+ && (rbt_no_children(anon_vma1)))
+ return 1;
+ /*
+ * Different anon_vma and shared -> unmergeable
+ */
+ else
+ return 0;
}
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 6a1e8c7f6213..1093b518b0be 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -387,6 +387,46 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
return -ENOMEM;
}
+/**
+ * reconnect_pages() - Reconnect physical pages from old to vma
+ * @vma: VMA to newly contain all physical pages of old
+ * @old: old VMA being merged to vma
+ */
+void reconnect_pages(struct vm_area_struct *vma, struct vm_area_struct *old)
+{
+ struct anon_vma *anon_vma1 = vma->anon_vma;
+ struct anon_vma *anon_vma2 = old->anon_vma;
+ unsigned long pg_iter;
+ int pg_iters;
+
+ if (anon_vma1 == anon_vma2 || anon_vma1 == NULL || anon_vma2 == NULL)
+ return; /* Nothing to do */
+
+ /* Modify page->mapping for all pages in old */
+ pg_iter = 0;
+ pg_iters = (old->vm_end - old->vm_start) >> PAGE_SHIFT;
+
+ for (; pg_iter < pg_iters; ++pg_iter) {
+ /* Get the physical page */
+ unsigned long shift = pg_iter << PAGE_SHIFT;
+ struct page *phys_page = follow_page(old, old->vm_start + shift, FOLL_GET);
+ struct anon_vma *page_anon_vma;
+
+ /* Do some checks and lock the page */
+ if (phys_page == NULL)
+ continue; /* Virtual memory page is not mapped */
+ lock_page(phys_page);
+ page_anon_vma = page_get_anon_vma(phys_page);
+ if (page_anon_vma != NULL) { /* NULL in case of ZERO_PAGE */
+ VM_BUG_ON_VMA(page_anon_vma != old->anon_vma, old);
+ /* Update physical page's mapping */
+ page_move_anon_rmap(phys_page, vma);
+ }
+ unlock_page(phys_page);
+ put_page(phys_page);
+ }
+}
+
void unlink_anon_vmas(struct vm_area_struct *vma)
{
struct anon_vma_chain *avc, *next;
--
2.34.1
Add trace support for vma_merge() to measure successful and
unsuccessful merges of two VMAs with distinct anon_vmas, and also
trace support for merges made possible by the page offset update
introduced by a previous patch in this series.
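Assuming tracefs is mounted at /sys/kernel/tracing, the new events
should appear as events/mmap/vm_av_merge and
events/mmap/vm_pgoff_merge; writing 1 to their enable files and
reading trace_pipe while running an mremap()-heavy workload is
enough to observe the merge outcomes.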
Signed-off-by: Jakub Matěna <[email protected]>
---
include/trace/events/mmap.h | 55 ++++++++++++++++++++++++++++++++
mm/internal.h | 11 +++++++
mm/mmap.c | 63 ++++++++++++++++++++-----------------
3 files changed, 100 insertions(+), 29 deletions(-)
diff --git a/include/trace/events/mmap.h b/include/trace/events/mmap.h
index 4661f7ba07c0..9f6439e2ed2d 100644
--- a/include/trace/events/mmap.h
+++ b/include/trace/events/mmap.h
@@ -6,6 +6,7 @@
#define _TRACE_MMAP_H
#include <linux/tracepoint.h>
+#include <../mm/internal.h>
TRACE_EVENT(vm_unmapped_area,
@@ -42,6 +43,60 @@ TRACE_EVENT(vm_unmapped_area,
__entry->low_limit, __entry->high_limit, __entry->align_mask,
__entry->align_offset)
);
+
+TRACE_EVENT(vm_av_merge,
+
+ TP_PROTO(int merged, enum vma_merge_res merge_prev, enum vma_merge_res merge_next, enum vma_merge_res merge_both),
+
+ TP_ARGS(merged, merge_prev, merge_next, merge_both),
+
+ TP_STRUCT__entry(
+ __field(int, merged)
+ __field(enum vma_merge_res, predecessor_different_av)
+ __field(enum vma_merge_res, successor_different_av)
+ __field(enum vma_merge_res, predecessor_with_successor_different_av)
+ __field(int, diff_count)
+ __field(int, failed_count)
+ ),
+
+ TP_fast_assign(
+ __entry->merged = merged == 0;
+ __entry->predecessor_different_av = merge_prev;
+ __entry->successor_different_av = merge_next;
+ __entry->predecessor_with_successor_different_av = merge_both;
+ __entry->diff_count = (merge_prev == AV_MERGE_DIFFERENT)
+ + (merge_next == AV_MERGE_DIFFERENT) + (merge_both == AV_MERGE_DIFFERENT);
+ __entry->failed_count = (merge_prev == AV_MERGE_FAILED)
+ + (merge_next == AV_MERGE_FAILED) + (merge_both == AV_MERGE_FAILED);
+ ),
+
+ TP_printk("merged=%d predecessor=%d successor=%d predecessor_with_successor=%d diff_count=%d failed_count=%d\n",
+ __entry->merged,
+ __entry->predecessor_different_av, __entry->successor_different_av,
+ __entry->predecessor_with_successor_different_av,
+ __entry->diff_count, __entry->failed_count)
+
+);
+
+TRACE_EVENT(vm_pgoff_merge,
+
+ TP_PROTO(struct vm_area_struct *vma, bool anon_pgoff_updated),
+
+ TP_ARGS(vma, anon_pgoff_updated),
+
+ TP_STRUCT__entry(
+ __field(bool, faulted)
+ __field(bool, updated)
+ ),
+
+ TP_fast_assign(
+ __entry->faulted = vma->anon_vma;
+ __entry->updated = anon_pgoff_updated;
+ ),
+
+ TP_printk("faulted=%d updated=%d\n",
+ __entry->faulted, __entry->updated)
+);
#endif
/* This part must be outside protection */
diff --git a/mm/internal.h b/mm/internal.h
index d80300392a19..b3e482175518 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -34,6 +34,17 @@ struct folio_batch;
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+/*
+ * Following values indicate reason for merge success or failure.
+ */
+enum vma_merge_res {
+ MERGE_FAILED,
+ AV_MERGE_FAILED,
+ AV_MERGE_SAME,
+ MERGE_OK = AV_MERGE_SAME,
+ AV_MERGE_DIFFERENT,
+};
+
void page_writeback_init(void);
static inline void *folio_raw_mapping(struct folio *folio)
diff --git a/mm/mmap.c b/mm/mmap.c
index ed91d0cd2111..10c76c6a3288 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1064,21 +1064,21 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
*/
if ((!anon_vma1 || !anon_vma2) && (!vma ||
list_is_singular(&vma->anon_vma_chain)))
- return 1;
+ return AV_MERGE_SAME;
if (anon_vma1 == anon_vma2)
- return 1;
+ return AV_MERGE_SAME;
/*
* Different anon_vma but not shared by several processes
*/
else if ((anon_vma1 && anon_vma2) &&
(anon_vma1 == anon_vma1->root)
&& (rbt_no_children(anon_vma1)))
- return 1;
+ return AV_MERGE_DIFFERENT;
/*
* Different anon_vma and shared -> unmergeable
*/
else
- return 0;
+ return AV_MERGE_FAILED;
}
/*
@@ -1099,12 +1099,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
const char *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name))
if (vma->vm_pgoff == vm_pgoff)
- return 1;
- }
- return 0;
+ return is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma);
+ return MERGE_FAILED;
}
/*
@@ -1121,14 +1119,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
const char *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
if (vma->vm_pgoff + vm_pglen == vm_pgoff)
- return 1;
+ return is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma);
}
- return 0;
+ return MERGE_FAILED;
}
/*
@@ -1185,9 +1182,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
int err;
- int merge_prev = 0;
- int merge_both = 0;
- int merge_next = 0;
+ /*
+ * Following three variables are used to store values
+ * indicating whether this VMA and its anon_vma can
+ * be merged and also the type of failure or success.
+ */
+ enum vma_merge_res merge_prev = MERGE_FAILED;
+ enum vma_merge_res merge_both = MERGE_FAILED;
+ enum vma_merge_res merge_next = MERGE_FAILED;
/*
* We later require that vma->vm_flags == vm_flags,
@@ -1210,38 +1212,39 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
* Can we merge predecessor?
*/
if (prev && prev->vm_end == addr &&
- mpol_equal(vma_policy(prev), policy) &&
- can_vma_merge_after(prev, vm_flags,
+ mpol_equal(vma_policy(prev), policy)) {
+ merge_prev = can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
- vm_userfaultfd_ctx, anon_name)) {
- merge_prev = true;
+ vm_userfaultfd_ctx, anon_name);
}
+
/*
* Can we merge successor?
*/
if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
+ mpol_equal(policy, vma_policy(next))) {
+ merge_next = can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
- vm_userfaultfd_ctx, anon_name)) {
- merge_next = true;
+ vm_userfaultfd_ctx, anon_name);
}
+
/*
* Can we merge both predecessor and successor?
*/
- if (merge_prev && merge_next)
+ if (merge_prev >= MERGE_OK && merge_next >= MERGE_OK) {
merge_both = is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL);
+ }
- if (merge_both) { /* cases 1, 6 */
+ if (merge_both >= MERGE_OK) { /* cases 1, 6 */
err = __vma_adjust(prev, prev->vm_start,
next->vm_end, prev->vm_pgoff, NULL,
prev);
area = prev;
- } else if (merge_prev) { /* cases 2, 5, 7 */
+ } else if (merge_prev >= MERGE_OK) { /* cases 2, 5, 7 */
err = __vma_adjust(prev, prev->vm_start,
end, prev->vm_pgoff, NULL, prev);
area = prev;
- } else if (merge_next) {
+ } else if (merge_next >= MERGE_OK) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
@@ -1252,7 +1255,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
} else {
err = -1;
}
-
+ trace_vm_av_merge(err, merge_prev, merge_next, merge_both);
/*
* Cannot merge with predecessor or successor or error in __vma_adjust?
*/
@@ -3359,6 +3362,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
/*
* Source vma may have been merged into new_vma
*/
+ trace_vm_pgoff_merge(vma, anon_pgoff_updated);
+
if (unlikely(vma_start >= new_vma->vm_start &&
vma_start < new_vma->vm_end)) {
/*
--
2.34.1
On Fri, 18 Feb 2022 18:46:29 +0000
Matthew Wilcox <[email protected]> wrote:
> On Fri, Feb 18, 2022 at 01:09:20PM -0500, Steven Rostedt wrote:
> > Please indent the above better. That is:
> >
> > __entry->diff_count = (merge_prev == AV_MERGE_DIFFERENT)
> > + (merge_next == AV_MERGE_DIFFERENT)
> > + (merge_both == AV_MERGE_DIFFERENT);
>
> I thought our coding style preferred trailing operators; that is:
>
> __entry->diff_count = (merge_prev == AV_MERGE_DIFFERENT) +
> (merge_next == AV_MERGE_DIFFERENT) +
> (merge_both == AV_MERGE_DIFFERENT);
I'm OK with either. I just can't handle the original.
-- Steve
* Jakub Matěna <[email protected]> [220218 07:21]:
> Motivation
> In the current kernel it is impossible to merge two anonymous VMAs
> if one of them was moved. That is because a VMA's page offset is
> set according to the virtual address where it was created, and two
> VMAs can only merge if their page offsets line up.
> Another obstacle when merging two VMAs is their anon_vma: in the
> current kernel both anon_vmas have to be one and the same,
> otherwise the merge is not allowed.
> Missed merge opportunities increase the number of VMAs of a process
> and in some cases can cause problems when the maximum map count
> (the vm.max_map_count sysctl) is reached.
Does this really happen that much? Is it worth trying even harder to
merge VMAs? I am not really sure the VMA merging today is worth it - we
are under a lock known to be a bottleneck while examining if it's
possible to merge. Hard data about how often and the cost of merging
would be a good argument to try harder or give up earlier.
>
> Solution
> This patch series solves the first problem, the page offsets, by
> updating them when a VMA is moved to a different virtual address
> (patch 2). As for the second problem, merging of VMAs with
> different anon_vmas is allowed (patch 3). Patch 1 refactors the
> function vma_merge() to make it easier to understand and to allow
> relatively seamless tracing of successful merges, which is
> introduced by patch 4.
>
> Limitations
> For both problems the solution works only for VMAs that do not
> share physical pages with other processes (usually child or parent
> processes). This is checked by looking at the anon_vma of the
> respective VMA. The reason it is not possible, or at least not
> easy, to go further is that each physical page stores a pointer to
> its anon_vma as well as its page offset, and when the page is
> shared we cannot simply change these without affecting all of the
> VMAs mapping it. The good news is that this case accounts for only
> about 1-3% of all merges that fail in the current kernel (measured
> for internet browsing and compilation use cases).
It sounds like you have data for some use cases on the mergers already.
Do you have any results on this change?
>
> This series of patches and documentation of the related code will
> be part of my master's thesis.
> This patch series is based on tag v5.17-rc4.
>
> Jakub Matěna (4):
> mm: refactor of vma_merge()
> mm: adjust page offset in mremap
> mm: enable merging of VMAs with different anon_vmas
> mm: add tracing for VMA merges
>
> include/linux/rmap.h | 17 ++-
> include/trace/events/mmap.h | 55 +++++++++
> mm/internal.h | 11 ++
> mm/mmap.c | 232 ++++++++++++++++++++++++++----------
> mm/rmap.c | 40 +++++++
> 5 files changed, 290 insertions(+), 65 deletions(-)
>
> --
> 2.34.1
>
On Fri, Feb 18, 2022 at 01:09:20PM -0500, Steven Rostedt wrote:
> Please indent the above better. That is:
>
> __entry->diff_count = (merge_prev == AV_MERGE_DIFFERENT)
> + (merge_next == AV_MERGE_DIFFERENT)
> + (merge_both == AV_MERGE_DIFFERENT);
I thought our coding style preferred trailing operators; that is:
__entry->diff_count = (merge_prev == AV_MERGE_DIFFERENT) +
(merge_next == AV_MERGE_DIFFERENT) +
(merge_both == AV_MERGE_DIFFERENT);
* Jakub Matěna <[email protected]> [220218 07:21]:
> Add trace support for vma_merge() to measure successful and
> unsuccessful merges of two VMAs with distinct anon_vmas, and also
> trace support for merges made possible by the page offset update
> introduced by a previous patch in this series.
>
> Signed-off-by: Jakub Matěna <[email protected]>
> ---
> include/trace/events/mmap.h | 55 ++++++++++++++++++++++++++++++++
> mm/internal.h | 11 +++++++
> mm/mmap.c | 63 ++++++++++++++++++++-----------------
> 3 files changed, 100 insertions(+), 29 deletions(-)
>
> diff --git a/include/trace/events/mmap.h b/include/trace/events/mmap.h
> index 4661f7ba07c0..9f6439e2ed2d 100644
> --- a/include/trace/events/mmap.h
> +++ b/include/trace/events/mmap.h
> @@ -6,6 +6,7 @@
> #define _TRACE_MMAP_H
>
> #include <linux/tracepoint.h>
> +#include <../mm/internal.h>
>
> TRACE_EVENT(vm_unmapped_area,
>
> @@ -42,6 +43,60 @@ TRACE_EVENT(vm_unmapped_area,
> __entry->low_limit, __entry->high_limit, __entry->align_mask,
> __entry->align_offset)
> );
> +
> +TRACE_EVENT(vm_av_merge,
> +
> + TP_PROTO(int merged, enum vma_merge_res merge_prev, enum vma_merge_res merge_next, enum vma_merge_res merge_both),
> +
> + TP_ARGS(merged, merge_prev, merge_next, merge_both),
> +
> + TP_STRUCT__entry(
> + __field(int, merged)
> + __field(enum vma_merge_res, predecessor_different_av)
> + __field(enum vma_merge_res, successor_different_av)
> + __field(enum vma_merge_res, predecessor_with_successor_different_av)
> + __field(int, diff_count)
> + __field(int, failed_count)
> + ),
> +
> + TP_fast_assign(
> + __entry->merged = merged == 0;
> + __entry->predecessor_different_av = merge_prev;
> + __entry->successor_different_av = merge_next;
> + __entry->predecessor_with_successor_different_av = merge_both;
> + __entry->diff_count = (merge_prev == AV_MERGE_DIFFERENT)
> + + (merge_next == AV_MERGE_DIFFERENT) + (merge_both == AV_MERGE_DIFFERENT);
> + __entry->failed_count = (merge_prev == AV_MERGE_FAILED)
> + + (merge_next == AV_MERGE_FAILED) + (merge_both == AV_MERGE_FAILED);
> + ),
> +
> + TP_printk("merged=%d predecessor=%d successor=%d predecessor_with_successor=%d diff_count=%d failed_count=%d\n",
> + __entry->merged,
> + __entry->predecessor_different_av, __entry->successor_different_av,
> + __entry->predecessor_with_successor_different_av,
> + __entry->diff_count, __entry->failed_count)
> +
> +);
> +
> +TRACE_EVENT(vm_pgoff_merge,
> +
> + TP_PROTO(struct vm_area_struct *vma, bool anon_pgoff_updated),
> +
> + TP_ARGS(vma, anon_pgoff_updated),
> +
> + TP_STRUCT__entry(
> + __field(bool, faulted)
> + __field(bool, updated)
> + ),
> +
> + TP_fast_assign(
> + __entry->faulted = vma->anon_vma;
> + __entry->updated = anon_pgoff_updated;
> + ),
> +
> + TP_printk("faulted=%d updated=%d\n",
> + __entry->faulted, __entry->updated)
> +);
> #endif
>
> /* This part must be outside protection */
> diff --git a/mm/internal.h b/mm/internal.h
> index d80300392a19..b3e482175518 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -34,6 +34,17 @@ struct folio_batch;
> /* Do not use these with a slab allocator */
> #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
>
> +/*
> + * Following values indicate reason for merge success or failure.
> + */
> +enum vma_merge_res {
> + MERGE_FAILED,
> + AV_MERGE_FAILED,
> + AV_MERGE_SAME,
> + MERGE_OK = AV_MERGE_SAME,
> + AV_MERGE_DIFFERENT,
> +};
> +
> void page_writeback_init(void);
>
> static inline void *folio_raw_mapping(struct folio *folio)
> diff --git a/mm/mmap.c b/mm/mmap.c
> index ed91d0cd2111..10c76c6a3288 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1064,21 +1064,21 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
> */
> if ((!anon_vma1 || !anon_vma2) && (!vma ||
> list_is_singular(&vma->anon_vma_chain)))
> - return 1;
> + return AV_MERGE_SAME;
> if (anon_vma1 == anon_vma2)
> - return 1;
> + return AV_MERGE_SAME;
> /*
> * Different anon_vma but not shared by several processes
> */
> else if ((anon_vma1 && anon_vma2) &&
> (anon_vma1 == anon_vma1->root)
> && (rbt_no_children(anon_vma1)))
> - return 1;
> + return AV_MERGE_DIFFERENT;
> /*
> * Different anon_vma and shared -> unmergeable
> */
> else
> - return 0;
> + return AV_MERGE_FAILED;
> }
>
> /*
> @@ -1099,12 +1099,10 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
> struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> const char *anon_name)
> {
> - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
> - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name))
> if (vma->vm_pgoff == vm_pgoff)
> - return 1;
> - }
> - return 0;
> + return is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma);
> + return MERGE_FAILED;
> }
>
> /*
> @@ -1121,14 +1119,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
> struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
> const char *anon_name)
> {
> - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
> - is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
> + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name)) {
> pgoff_t vm_pglen;
> vm_pglen = vma_pages(vma);
> if (vma->vm_pgoff + vm_pglen == vm_pgoff)
> - return 1;
> + return is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma);
> }
> - return 0;
> + return MERGE_FAILED;
> }
>
> /*
> @@ -1185,9 +1182,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
> pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
> struct vm_area_struct *area, *next;
> int err;
> - int merge_prev = 0;
> - int merge_both = 0;
> - int merge_next = 0;
> + /*
> + * Following three variables are used to store values
> + * indicating whether this VMA and its anon_vma can
> + * be merged and also the type of failure or success.
> + */
> + enum vma_merge_res merge_prev = MERGE_FAILED;
> + enum vma_merge_res merge_both = MERGE_FAILED;
> + enum vma_merge_res merge_next = MERGE_FAILED;
>
> /*
> * We later require that vma->vm_flags == vm_flags,
> @@ -1210,38 +1212,39 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
> * Can we merge predecessor?
> */
> if (prev && prev->vm_end == addr &&
> - mpol_equal(vma_policy(prev), policy) &&
> - can_vma_merge_after(prev, vm_flags,
> + mpol_equal(vma_policy(prev), policy)) {
> + merge_prev = can_vma_merge_after(prev, vm_flags,
> anon_vma, file, pgoff,
> - vm_userfaultfd_ctx, anon_name)) {
> - merge_prev = true;
> + vm_userfaultfd_ctx, anon_name);
> }
> +
> /*
> * Can we merge successor?
> */
> if (next && end == next->vm_start &&
> - mpol_equal(policy, vma_policy(next)) &&
> - can_vma_merge_before(next, vm_flags,
> + mpol_equal(policy, vma_policy(next))) {
> + merge_next = can_vma_merge_before(next, vm_flags,
> anon_vma, file, pgoff+pglen,
> - vm_userfaultfd_ctx, anon_name)) {
> - merge_next = true;
> + vm_userfaultfd_ctx, anon_name);
> }
> +
> /*
> * Can we merge both predecessor and successor?
> */
> - if (merge_prev && merge_next)
> + if (merge_prev >= MERGE_OK && merge_next >= MERGE_OK) {
What happened to making vma_merge easier to read? What does > MERGE_OK
mean? I guess this answers why booleans were not used, but I don't like
it. Couldn't you just log the err/success and the value of
merge_prev/merge_next? It's not like the code tries more than one way
of merging on failure..
> merge_both = is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL);
> + }
>
> - if (merge_both) { /* cases 1, 6 */
> + if (merge_both >= MERGE_OK) { /* cases 1, 6 */
> err = __vma_adjust(prev, prev->vm_start,
> next->vm_end, prev->vm_pgoff, NULL,
> prev);
> area = prev;
> - } else if (merge_prev) { /* cases 2, 5, 7 */
> + } else if (merge_prev >= MERGE_OK) { /* cases 2, 5, 7 */
> err = __vma_adjust(prev, prev->vm_start,
> end, prev->vm_pgoff, NULL, prev);
> area = prev;
> - } else if (merge_next) {
> + } else if (merge_next >= MERGE_OK) {
> if (prev && addr < prev->vm_end) /* case 4 */
> err = __vma_adjust(prev, prev->vm_start,
> addr, prev->vm_pgoff, NULL, next);
> @@ -1252,7 +1255,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
> } else {
> err = -1;
> }
> -
> + trace_vm_av_merge(err, merge_prev, merge_next, merge_both);
> /*
> * Cannot merge with predecessor or successor or error in __vma_adjust?
> */
> @@ -3359,6 +3362,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> /*
> * Source vma may have been merged into new_vma
> */
> + trace_vm_pgoff_merge(vma, anon_pgoff_updated);
> +
> if (unlikely(vma_start >= new_vma->vm_start &&
> vma_start < new_vma->vm_end)) {
> /*
> --
> 2.34.1
>
On Fri, 18 Feb 2022 13:20:19 +0100
Jakub Matěna <[email protected]> wrote:
> diff --git a/include/trace/events/mmap.h b/include/trace/events/mmap.h
> index 4661f7ba07c0..9f6439e2ed2d 100644
> --- a/include/trace/events/mmap.h
> +++ b/include/trace/events/mmap.h
> @@ -6,6 +6,7 @@
> #define _TRACE_MMAP_H
>
> #include <linux/tracepoint.h>
> +#include <../mm/internal.h>
>
> TRACE_EVENT(vm_unmapped_area,
>
> @@ -42,6 +43,60 @@ TRACE_EVENT(vm_unmapped_area,
> __entry->low_limit, __entry->high_limit, __entry->align_mask,
> __entry->align_offset)
> );
> +
> +TRACE_EVENT(vm_av_merge,
> +
> + TP_PROTO(int merged, enum vma_merge_res merge_prev, enum vma_merge_res merge_next, enum vma_merge_res merge_both),
> +
> + TP_ARGS(merged, merge_prev, merge_next, merge_both),
> +
> + TP_STRUCT__entry(
> + __field(int, merged)
> + __field(enum vma_merge_res, predecessor_different_av)
> + __field(enum vma_merge_res, successor_different_av)
> + __field(enum vma_merge_res, predecessor_with_successor_different_av)
> + __field(int, diff_count)
> + __field(int, failed_count)
> + ),
> +
> + TP_fast_assign(
> + __entry->merged = merged == 0;
> + __entry->predecessor_different_av = merge_prev;
> + __entry->successor_different_av = merge_next;
> + __entry->predecessor_with_successor_different_av = merge_both;
> + __entry->diff_count = (merge_prev == AV_MERGE_DIFFERENT)
> + + (merge_next == AV_MERGE_DIFFERENT) + (merge_both == AV_MERGE_DIFFERENT);
> + __entry->failed_count = (merge_prev == AV_MERGE_FAILED)
> + + (merge_next == AV_MERGE_FAILED) + (merge_both == AV_MERGE_FAILED);
Please indent the above better. That is:
__entry->diff_count = (merge_prev == AV_MERGE_DIFFERENT)
+ (merge_next == AV_MERGE_DIFFERENT)
+ (merge_both == AV_MERGE_DIFFERENT);
> + ),
> +
> + TP_printk("merged=%d predecessor=%d successor=%d predecessor_with_successor=%d diff_count=%d failed_count=%d\n",
> + __entry->merged,
> + __entry->predecessor_different_av, __entry->successor_different_av,
> + __entry->predecessor_with_successor_different_av,
> + __entry->diff_count, __entry->failed_count)
To make the above easier to read for humans, you could have at the start:
#define AV_MERGE_TYPES \
EM(MERGE_FAILED), \
EM(AV_MERGE_FAILED) \
EM(MERGE_OK) \
EMe(AV_MERGE_DIFFERENT)
#define EM(a) TRACE_DEFINE_ENUM(a);
#define EMe(a) TRACE_DEFINE_ENUM(a);
AV_MERGE_TYPES
#undef EM
#undef EMe
#define EM(a) {a, #a},
#define EMe(a) {a, #a}
Then:
TP_printk("merged=%d predecessor=%s successor=%s predecessor_with_successor=%s diff_count=%d failed_count=%d",
(note, no "\n", get rid of that)
__entry->merged,
__print_symbolic(__entry->predecessor_different_av, AV_MERGE_TYPES),
__print_symbolic(__entry->successor_different_av, AV_MERGE_TYPES),
__print_symbolic(__entry->predecessor_with_successor_different_av, AV_MERGE_TYPES),
__entry->diff_count, __entry->failed_count)
Then the output will show strings instead of meaningless numbers.
-- Steve
> +
> +);
> +
> +TRACE_EVENT(vm_pgoff_merge,
> +
> + TP_PROTO(struct vm_area_struct *vma, bool anon_pgoff_updated),
> +
> + TP_ARGS(vma, anon_pgoff_updated),
> +
> + TP_STRUCT__entry(
> + __field(bool, faulted)
> + __field(bool, updated)
> + ),
> +
> + TP_fast_assign(
> + __entry->faulted = vma->anon_vma;
> + __entry->updated = anon_pgoff_updated;
> + ),
> +
> + TP_printk("faulted=%d updated=%d\n",
> + __entry->faulted, __entry->updated)
> +);
> #endif
>
> /* This part must be outside protection */
* Jakub Matěna <[email protected]> [220218 07:21]:
> Adjust the page offset of a VMA when it is moved to a new location
> by mremap. This is made possible for all VMAs that do not share
> their anonymous pages with other processes; previously it was
> possible only for not-yet-faulted VMAs.
> When the page offset does not correspond to the virtual address of
> the anonymous VMA, any merge attempt with another VMA will fail.
I don't know enough to fully answer this but I think this may cause
issues with rmap? I would think the anon vma lock is necessary but I
may be missing something.
>
> Signed-off-by: Jakub Matěna <[email protected]>
> ---
> mm/mmap.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 95 insertions(+), 6 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index b55e11f20571..8d253b46b349 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -3224,6 +3224,91 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
> return 0;
> }
>
> +bool rbst_no_children(struct anon_vma *av, struct rb_node *node)
> +{
> + struct anon_vma_chain *model;
> + struct anon_vma_chain *avc;
> +
> + if (node == NULL) /* leaf node */
> + return true;
> + avc = container_of(node, typeof(*(model)), rb);
> + if (avc->vma->anon_vma != av)
> + /*
> + * Inequality implies avc belongs
> + * to a VMA of a child process
> + */
> + return false;
> + return (rbst_no_children(av, node->rb_left) &&
> + rbst_no_children(av, node->rb_right));
> +}
> +
> +/*
> + * Check if none of the VMAs connected to the given
> + * anon_vma via anon_vma_chain are in child relationship
> + */
> +bool rbt_no_children(struct anon_vma *av)
> +{
> + struct rb_node *root_node;
> +
> + if (av == NULL || av->degree <= 1) /* Higher degree might not necessarily imply children */
> + return true;
> + root_node = av->rb_root.rb_root.rb_node;
> + return rbst_no_children(av, root_node);
> +}
> +
> +/**
> + * update_faulted_pgoff() - Update faulted pages of a vma
> + * @vma: VMA being moved
> + * @addr: new virtual address
> + * @pgoff: pointer to pgoff which is updated
> + * If the vma and its pages are not shared with another process, update
> + * the new pgoff and also update index parameter (copy of the pgoff) in
> + * all faulted pages.
> + */
> +bool update_faulted_pgoff(struct vm_area_struct *vma, unsigned long addr, pgoff_t *pgoff)
> +{
> + unsigned long pg_iter = 0;
> + unsigned long pg_iters = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> +
> + /* 1.] Check vma is not shared with other processes */
> + if (vma->anon_vma->root != vma->anon_vma || !rbt_no_children(vma->anon_vma))
> + return false;
> +
> + /* 2.] Check all pages are not shared */
> + for (; pg_iter < pg_iters; ++pg_iter) {
> + bool pages_not_shared = true;
> + unsigned long shift = pg_iter << PAGE_SHIFT;
> + struct page *phys_page = follow_page(vma, vma->vm_start + shift, FOLL_GET);
> +
> + if (phys_page == NULL)
> + continue;
> +
> + /* Check page is not shared with other processes */
> + if (page_mapcount(phys_page) > 1)
> + pages_not_shared = false;
> + put_page(phys_page);
> + if (!pages_not_shared)
> + return false;
> + }
> +
> + /* 3.] Update index in all pages to this new pgoff */
> + pg_iter = 0;
> + *pgoff = addr >> PAGE_SHIFT;
> +
> + for (; pg_iter < pg_iters; ++pg_iter) {
> + unsigned long shift = pg_iter << PAGE_SHIFT;
> + struct page *phys_page = follow_page(vma, vma->vm_start + shift, FOLL_GET);
> +
> + if (phys_page == NULL)
> + continue;
> + lock_page(phys_page);
> + phys_page->index = *pgoff + pg_iter;
> + unlock_page(phys_page);
> + put_page(phys_page);
> + }
> + return true;
> +}
> +
> /*
> * Copy the vma structure to a new location in the same mm,
> * prior to moving page table entries, to effect an mremap move.
> @@ -3237,15 +3322,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> struct mm_struct *mm = vma->vm_mm;
> struct vm_area_struct *new_vma, *prev;
> struct rb_node **rb_link, *rb_parent;
> - bool faulted_in_anon_vma = true;
> + bool anon_pgoff_updated = false;
>
> /*
> - * If anonymous vma has not yet been faulted, update new pgoff
> + * Try to update new pgoff for anonymous vma
> * to match new location, to increase its chance of merging.
> */
> - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> - pgoff = addr >> PAGE_SHIFT;
> - faulted_in_anon_vma = false;
> + if (unlikely(vma_is_anonymous(vma))) {
> + if (!vma->anon_vma) {
> + pgoff = addr >> PAGE_SHIFT;
> + anon_pgoff_updated = true;
> + } else {
> + anon_pgoff_updated = update_faulted_pgoff(vma, addr, &pgoff);
> + }
> }
>
> if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
> @@ -3271,7 +3360,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> * safe. It is only safe to keep the vm_pgoff
> * linear if there are no pages mapped yet.
> */
> - VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
> + VM_BUG_ON_VMA(!anon_pgoff_updated, new_vma);
> *vmap = vma = new_vma;
> }
> *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
> --
> 2.34.1
>
* Vlastimil Babka <[email protected]> [220225 05:39]:
> On 2/18/22 20:57, Liam Howlett wrote:
> >> /*
> >> * Can we merge both predecessor and successor?
> >> */
> >> - if (merge_prev && merge_next)
> >> + if (merge_prev >= MERGE_OK && merge_next >= MERGE_OK) {
> >
> > What happened to making vma_merge easier to read? What does > MERGE_OK
> > mean? I guess this answers why booleans were not used, but I don't like
>
> It's similar to e.g. enum compact_priority where specific values are defined
> as well as more abstract aliases.
>
> > it. Couldn't you just log the err/success and the value of
> > merge_prev/merge_next? It's not like the code tries more than one way
> > of merging on failure..
>
> An initial version had the "log" (trace point really) at multiple places and
> it was uglier than collecting details in the variables and having a single
> tracepoint call site.
Wouldn't the success/failure, merge_prev, and merge_next be all we need
to know what happened? This could be placed near the end of the
function. It doesn't say why merge_prev or merge_next was set to false
but I think that's enough for most uses?
>
> Note that the tracepoint is being provided as part of the series mainly to
> allow evaluation of the series. If it's deemed too specific to be included
> in mainline in the end, so be it.
Ah, in that case use bool until we arrive at this patch, then change
merge_* to int.
>
> >> merge_both = is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL);
> >> + }
> >>
> >> - if (merge_both) { /* cases 1, 6 */
> >> + if (merge_both >= MERGE_OK) { /* cases 1, 6 */
> >> err = __vma_adjust(prev, prev->vm_start,
> >> next->vm_end, prev->vm_pgoff, NULL,
> >> prev);
> >> area = prev;
> >> - } else if (merge_prev) { /* cases 2, 5, 7 */
> >> + } else if (merge_prev >= MERGE_OK) { /* cases 2, 5, 7 */
> >> err = __vma_adjust(prev, prev->vm_start,
> >> end, prev->vm_pgoff, NULL, prev);
> >> area = prev;
> >> - } else if (merge_next) {
> >> + } else if (merge_next >= MERGE_OK) {
> >> if (prev && addr < prev->vm_end) /* case 4 */
> >> err = __vma_adjust(prev, prev->vm_start,
> >> addr, prev->vm_pgoff, NULL, next);
> >> @@ -1252,7 +1255,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
> >> } else {
> >> err = -1;
> >> }
> >> -
> >> + trace_vm_av_merge(err, merge_prev, merge_next, merge_both);
> >> /*
> >> * Cannot merge with predecessor or successor or error in __vma_adjust?
> >> */
> >> @@ -3359,6 +3362,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> >> /*
> >> * Source vma may have been merged into new_vma
> >> */
> >> + trace_vm_pgoff_merge(vma, anon_pgoff_updated);
> >> +
> >> if (unlikely(vma_start >= new_vma->vm_start &&
> >> vma_start < new_vma->vm_end)) {
> >> /*
> >> --
> >> 2.34.1
> >>
>
* Vlastimil Babka <[email protected]> [220225 05:31]:
> On 2/18/22 20:21, Liam Howlett wrote:
> > * Jakub Matěna <[email protected]> [220218 07:21]:
> >> Motivation
> >> In the current kernel it is impossible to merge two anonymous VMAs
> >> if one of them was moved. That is because a VMA's page offset is
> >> set according to the virtual address where it was created, and two
> >> VMAs can only merge if their page offsets line up.
> >> Another obstacle when merging two VMAs is their anon_vma: in the
> >> current kernel both anon_vmas have to be one and the same,
> >> otherwise the merge is not allowed.
> >> Missed merge opportunities increase the number of VMAs of a process
> >> and in some cases can cause problems when the maximum map count
> >> (the vm.max_map_count sysctl) is reached.
> >
> > Does this really happen that much? Is it worth trying even harder to
>
> Let me perhaps clarify. Maybe not in general, but some mremap() heavy
> workloads fragment VMA space a lot, have to increase the vma limits etc.
> While the original motivation was a proprietary workload, there are e.g.
> allocators such as jemalloc that rely on mremap().
>
> But yes, it might turn out that the benefit is not universal and we might
> consider some ways to make more aggressive merging opt-in.
>
> > merge VMAs? I am not really sure the VMA merging today is worth it - we
> > are under a lock known to be a bottleneck while examining if it's
>
> I'd be afraid that by scaling back existing merging would break some
> userspace expectations inspecting e.g. /proc/pid/maps
Is that a risk considering how many things stop the merging of VMAs? We
just added another (names). Not all the information can be in
/proc/pid/maps - otherwise the tracing patch wouldn't really be
necessary?
>
> > possible to merge. Hard data about how often and the cost of merging
> > would be a good argument to try harder or give up earlier.
> >
> >>
> >> Solution
> >> This patch series solves the first problem, the page offsets, by
> >> updating them when a VMA is moved to a different virtual address
> >> (patch 2). As for the second problem, merging of VMAs with
> >> different anon_vmas is allowed (patch 3). Patch 1 refactors the
> >> function vma_merge() to make it easier to understand and to allow
> >> relatively seamless tracing of successful merges, which is
> >> introduced by patch 4.
> >>
> >> Limitations
> >> For both problems the solution works only for VMAs that do not
> >> share physical pages with other processes (usually child or parent
> >> processes). This is checked by looking at the anon_vma of the
> >> respective VMA. The reason it is not possible, or at least not
> >> easy, to go further is that each physical page stores a pointer to
> >> its anon_vma as well as its page offset, and when the page is
> >> shared we cannot simply change these without affecting all of the
> >> VMAs mapping it. The good news is that this case accounts for only
> >> about 1-3% of all merges that fail in the current kernel (measured
> >> for internet browsing and compilation use cases).
> >
> > It sounds like you have data for some use cases on the mergers already.
> > Do you have any results on this change?
> >
> >>
> >> This series of patches and documentation of the related code will
> >> be part of my master's thesis.
> >> This patch series is based on tag v5.17-rc4.
> >>
> >> Jakub Matěna (4):
> >> mm: refactor of vma_merge()
> >> mm: adjust page offset in mremap
> >> mm: enable merging of VMAs with different anon_vmas
> >> mm: add tracing for VMA merges
> >>
> >> include/linux/rmap.h | 17 ++-
> >> include/trace/events/mmap.h | 55 +++++++++
> >> mm/internal.h | 11 ++
> >> mm/mmap.c | 232 ++++++++++++++++++++++++++----------
> >> mm/rmap.c | 40 +++++++
> >> 5 files changed, 290 insertions(+), 65 deletions(-)
> >>
> >> --
> >> 2.34.1
> >>
>
On 2/18/22 20:57, Liam Howlett wrote:
>> /*
>> * Can we merge both predecessor and successor?
>> */
>> - if (merge_prev && merge_next)
>> + if (merge_prev >= MERGE_OK && merge_next >= MERGE_OK) {
>
> What happened to making vma_merge easier to read? What does > MERGE_OK
> mean? I guess this answers why booleans were not used, but I don't like
It's similar to e.g. enum compact_priority where specific values are defined
as well as more abstract aliases.
> it. Couldn't you just log the err/success and the value of
> merge_prev/merge_next? It's not like the code tries more than one way
> of merging on failure..
An initial version had the "log" (trace point really) at multiple places and
it was uglier than collecting details in the variables and having a single
tracepoint call site.
Note that the tracepoint is being provided as part of the series mainly to
allow evaluation of the series. If it's deemed too specific to be included
in mainline in the end, so be it.
>> merge_both = is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL);
>> + }
>>
>> - if (merge_both) { /* cases 1, 6 */
>> + if (merge_both >= MERGE_OK) { /* cases 1, 6 */
>> err = __vma_adjust(prev, prev->vm_start,
>> next->vm_end, prev->vm_pgoff, NULL,
>> prev);
>> area = prev;
>> - } else if (merge_prev) { /* cases 2, 5, 7 */
>> + } else if (merge_prev >= MERGE_OK) { /* cases 2, 5, 7 */
>> err = __vma_adjust(prev, prev->vm_start,
>> end, prev->vm_pgoff, NULL, prev);
>> area = prev;
>> - } else if (merge_next) {
>> + } else if (merge_next >= MERGE_OK) {
>> if (prev && addr < prev->vm_end) /* case 4 */
>> err = __vma_adjust(prev, prev->vm_start,
>> addr, prev->vm_pgoff, NULL, next);
>> @@ -1252,7 +1255,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
>> } else {
>> err = -1;
>> }
>> -
>> + trace_vm_av_merge(err, merge_prev, merge_next, merge_both);
>> /*
>> * Cannot merge with predecessor or successor or error in __vma_adjust?
>> */
>> @@ -3359,6 +3362,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
>> /*
>> * Source vma may have been merged into new_vma
>> */
>> + trace_vm_pgoff_merge(vma, anon_pgoff_updated);
>> +
>> if (unlikely(vma_start >= new_vma->vm_start &&
>> vma_start < new_vma->vm_end)) {
>> /*
>> --
>> 2.34.1
>>
On 2/18/22 20:21, Liam Howlett wrote:
> * Jakub Matěna <[email protected]> [220218 07:21]:
>> Motivation
>> In the current kernel it is impossible to merge two anonymous VMAs
>> if one of them was moved. That is because a VMA's page offset is
>> set according to the virtual address where it was created, and two
>> VMAs can only merge if their page offsets line up.
>> Another obstacle when merging two VMAs is their anon_vma: in the
>> current kernel both anon_vmas have to be one and the same,
>> otherwise the merge is not allowed.
>> Missed merge opportunities increase the number of VMAs of a process
>> and in some cases can cause problems when the maximum map count
>> (the vm.max_map_count sysctl) is reached.
>
> Does this really happen that much? Is it worth trying even harder to
Let me perhaps clarify. Maybe not in general, but some mremap() heavy
workloads fragment VMA space a lot, have to increase the vma limits etc.
While the original motivation was a proprietary workload, there are e.g.
allocators such as jemalloc that rely on mremap().
But yes, it might turn out that the benefit is not universal and we might
consider some ways to make more aggressive merging opt-in.
> merge VMAs? I am not really sure the VMA merging today is worth it - we
> are under a lock known to be a bottleneck while examining if it's
I'd be afraid that by scaling back existing merging would break some
userspace expectations inspecting e.g. /proc/pid/maps
> possible to merge. Hard data about how often and the cost of merging
> would be a good argument to try harder or give up earlier.
>
>>
>> Solution
>> This patch series solves the first problem, the page offsets, by
>> updating them when a VMA is moved to a different virtual address
>> (patch 2). As for the second problem, merging of VMAs with
>> different anon_vmas is allowed (patch 3). Patch 1 refactors the
>> function vma_merge() to make it easier to understand and to allow
>> relatively seamless tracing of successful merges, which is
>> introduced by patch 4.
>>
>> Limitations
>> For both problems the solution works only for VMAs that do not
>> share physical pages with other processes (usually child or parent
>> processes). This is checked by looking at the anon_vma of the
>> respective VMA. The reason it is not possible, or at least not
>> easy, to go further is that each physical page stores a pointer to
>> its anon_vma as well as its page offset, and when the page is
>> shared we cannot simply change these without affecting all of the
>> VMAs mapping it. The good news is that this case accounts for only
>> about 1-3% of all merges that fail in the current kernel (measured
>> for internet browsing and compilation use cases).
>
> It sounds like you have data for some use cases on the mergers already.
> Do you have any results on this change?
>
>>
>> This series of patches and documentation of the related code will
>> be part of my master's thesis.
>> This patch series is based on tag v5.17-rc4.
>>
>> Jakub Matěna (4):
>> mm: refactor of vma_merge()
>> mm: adjust page offset in mremap
>> mm: enable merging of VMAs with different anon_vmas
>> mm: add tracing for VMA merges
>>
>> include/linux/rmap.h | 17 ++-
>> include/trace/events/mmap.h | 55 +++++++++
>> mm/internal.h | 11 ++
>> mm/mmap.c | 232 ++++++++++++++++++++++++++----------
>> mm/rmap.c | 40 +++++++
>> 5 files changed, 290 insertions(+), 65 deletions(-)
>>
>> --
>> 2.34.1
>>
On Fri, Feb 18, 2022 at 8:50 PM Liam Howlett <[email protected]> wrote:
>
> * Jakub Matěna <[email protected]> [220218 07:21]:
> > Adjust the page offset of a VMA when it is moved to a new location
> > by mremap. This is made possible for all VMAs that do not share
> > their anonymous pages with other processes; previously it was
> > possible only for not-yet-faulted VMAs.
> > When the page offset does not correspond to the virtual address of
> > the anonymous VMA, any merge attempt with another VMA will fail.
>
> I don't know enough to fully answer this but I think this may cause
> issues with rmap? I would think the anon vma lock is necessary but I
> may be missing something.
Lock ordering for memory management is specified at the beginning of
the mm/rmap.c file and implies that in order to lock anon_vma->rwsem,
you first have to lock mm->mmap_lock. mm->mmap_lock is locked in the
mremap syscall via mmap_write_lock_killable(). Moreover, the newly
introduced function update_faulted_pgoff() checks that the vma does
not share any physical pages with other processes, so other processes
shouldn't interfere with the locks either.
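To illustrate, a minimal sketch of that nesting (using the v5.17
lock helpers; this is not the actual patch code):

static void lock_order_sketch(struct mm_struct *mm,
			      struct vm_area_struct *vma)
{
	mmap_write_lock(mm);			/* taken by mremap itself */
	anon_vma_lock_write(vma->anon_vma);	/* nests inside mmap_lock */
	/* ... safe to update vm_pgoff and page->index here ... */
	anon_vma_unlock_write(vma->anon_vma);
	mmap_write_unlock(mm);
}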
>
> >
> > Signed-off-by: Jakub Matěna <[email protected]>
> > ---
> > mm/mmap.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 95 insertions(+), 6 deletions(-)
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index b55e11f20571..8d253b46b349 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -3224,6 +3224,91 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
> > return 0;
> > }
> >
> > +bool rbst_no_children(struct anon_vma *av, struct rb_node *node)
> > +{
> > + struct anon_vma_chain *model;
> > + struct anon_vma_chain *avc;
> > +
> > + if (node == NULL) /* leaf node */
> > + return true;
> > + avc = container_of(node, typeof(*(model)), rb);
> > + if (avc->vma->anon_vma != av)
> > + /*
> > + * Inequality implies avc belongs
> > + * to a VMA of a child process
> > + */
> > + return false;
> > + return (rbst_no_children(av, node->rb_left) &&
> > + rbst_no_children(av, node->rb_right));
> > +}
> > +
> > +/*
> > + * Check if none of the VMAs connected to the given
> > + * anon_vma via anon_vma_chain are in child relationship
> > + */
> > +bool rbt_no_children(struct anon_vma *av)
> > +{
> > + struct rb_node *root_node;
> > +
> > + if (av == NULL || av->degree <= 1) /* Higher degree might not necessarily imply children */
> > + return true;
> > + root_node = av->rb_root.rb_root.rb_node;
> > + return rbst_no_children(av, root_node);
> > +}
> > +
> > +/**
> > + * update_faulted_pgoff() - Update faulted pages of a vma
> > + * @vma: VMA being moved
> > + * @addr: new virtual address
> > + * @pgoff: pointer to pgoff which is updated
> > + * If the vma and its pages are not shared with another process, update
> > + * the new pgoff and also update index parameter (copy of the pgoff) in
> > + * all faulted pages.
> > + */
> > +bool update_faulted_pgoff(struct vm_area_struct *vma, unsigned long addr, pgoff_t *pgoff)
> > +{
> > + unsigned long pg_iter = 0;
> > + unsigned long pg_iters = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> > +
> > + /* 1.] Check vma is not shared with other processes */
> > + if (vma->anon_vma->root != vma->anon_vma || !rbt_no_children(vma->anon_vma))
> > + return false;
> > +
> > + /* 2.] Check all pages are not shared */
> > + for (; pg_iter < pg_iters; ++pg_iter) {
> > + bool pages_not_shared = true;
> > + unsigned long shift = pg_iter << PAGE_SHIFT;
> > + struct page *phys_page = follow_page(vma, vma->vm_start + shift, FOLL_GET);
> > +
> > + if (phys_page == NULL)
> > + continue;
> > +
> > + /* Check page is not shared with other processes */
> > + if (page_mapcount(phys_page) > 1)
> > + pages_not_shared = false;
> > + put_page(phys_page);
> > + if (!pages_not_shared)
> > + return false;
> > + }
> > +
> > + /* 3.] Update index in all pages to this new pgoff */
> > + pg_iter = 0;
> > + *pgoff = addr >> PAGE_SHIFT;
> > +
> > + for (; pg_iter < pg_iters; ++pg_iter) {
> > + unsigned long shift = pg_iter << PAGE_SHIFT;
> > + struct page *phys_page = follow_page(vma, vma->vm_start + shift, FOLL_GET);
> > +
> > + if (phys_page == NULL)
> > + continue;
> > + lock_page(phys_page);
> > + phys_page->index = *pgoff + pg_iter;
> > + unlock_page(phys_page);
> > + put_page(phys_page);
> > + }
> > + return true;
> > +}
> > +
> > /*
> > * Copy the vma structure to a new location in the same mm,
> > * prior to moving page table entries, to effect an mremap move.
> > @@ -3237,15 +3322,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> > struct mm_struct *mm = vma->vm_mm;
> > struct vm_area_struct *new_vma, *prev;
> > struct rb_node **rb_link, *rb_parent;
> > - bool faulted_in_anon_vma = true;
> > + bool anon_pgoff_updated = false;
> >
> > /*
> > - * If anonymous vma has not yet been faulted, update new pgoff
> > + * Try to update new pgoff for anonymous vma
> > * to match new location, to increase its chance of merging.
> > */
> > - if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> > - pgoff = addr >> PAGE_SHIFT;
> > - faulted_in_anon_vma = false;
> > + if (unlikely(vma_is_anonymous(vma))) {
> > + if (!vma->anon_vma) {
> > + pgoff = addr >> PAGE_SHIFT;
> > + anon_pgoff_updated = true;
> > + } else {
> > + anon_pgoff_updated = update_faulted_pgoff(vma, addr, &pgoff);
> > + }
> > }
> >
> > if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
> > @@ -3271,7 +3360,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> > * safe. It is only safe to keep the vm_pgoff
> > * linear if there are no pages mapped yet.
> > */
> > - VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
> > + VM_BUG_ON_VMA(!anon_pgoff_updated, new_vma);
> > *vmap = vma = new_vma;
> > }
> > *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
> > --
> > 2.34.1
> >