2022-02-22 06:04:48

by Suren Baghdasaryan

[permalink] [raw]
Subject: [PATCH 1/3] mm: refactor vm_area_struct::anon_vma_name usage code

Avoid mixing strings and their anon_vma_name referenced pointers
by using struct anon_vma_name whenever possible. This simplifies
the code and allows easier sharing of anon_vma_name structures when
they represent the same name.

Suggested-by: Matthew Wilcox <[email protected]>
Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: Suren Baghdasaryan <[email protected]>
---
fs/proc/task_mmu.c | 4 +-
include/linux/mm.h | 7 +--
include/linux/mm_inline.h | 75 ++++++++++++++++++----------
include/linux/mm_types.h | 5 +-
kernel/sys.c | 18 +++++--
mm/madvise.c | 100 ++++++++++++++++----------------------
mm/mmap.c | 10 ++--
7 files changed, 118 insertions(+), 101 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6e97ed775074..5bfcf24493ac 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -309,7 +309,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)

name = arch_vma_name(vma);
if (!name) {
- const char *anon_name;
+ struct anon_vma_name *anon_name;

if (!mm) {
name = "[vdso]";
@@ -330,7 +330,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
anon_name = vma_anon_name(vma);
if (anon_name) {
seq_pad(m, ' ');
- seq_printf(m, "[anon:%s]", anon_name);
+ seq_printf(m, "[anon:%s]", anon_name->name);
}
}

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 213cc569b192..5744a3fc4716 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2626,7 +2626,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *, struct vm_userfaultfd_ctx, const char *);
+ struct mempolicy *, struct vm_userfaultfd_ctx, struct anon_vma_name *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
@@ -3372,11 +3372,12 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)

#ifdef CONFIG_ANON_VMA_NAME
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, const char *name);
+ unsigned long len_in,
+ struct anon_vma_name *anon_name);
#else
static inline int
madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, const char *name) {
+ unsigned long len_in, struct anon_vma_name *anon_name) {
return 0;
}
#endif
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index b725839dfe71..70b619442d56 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -140,50 +140,73 @@ static __always_inline void del_page_from_lru_list(struct page *page,

#ifdef CONFIG_ANON_VMA_NAME
/*
- * mmap_lock should be read-locked when calling vma_anon_name() and while using
- * the returned pointer.
+ * mmap_lock should be read-locked when calling vma_anon_name(). Caller should
+ * either keep holding the lock while using the returned pointer or it should
+ * raise anon_vma_name refcount before releasing the lock.
*/
-extern const char *vma_anon_name(struct vm_area_struct *vma);
+extern struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma);
+extern struct anon_vma_name *anon_vma_name_alloc(const char *name);

-/*
- * mmap_lock should be read-locked for orig_vma->vm_mm.
- * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be
- * isolated.
- */
-extern void dup_vma_anon_name(struct vm_area_struct *orig_vma,
- struct vm_area_struct *new_vma);
+/* mmap_lock should be read-locked */
+static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
+{
+ if (anon_name)
+ kref_get(&anon_name->kref);
+}

-/*
- * mmap_lock should be write-locked or vma should have been isolated under
- * write-locked mmap_lock protection.
- */
-extern void free_vma_anon_name(struct vm_area_struct *vma);
+extern void anon_vma_name_put(struct anon_vma_name *anon_name);

-/* mmap_lock should be read-locked */
-static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
- const char *name)
+static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+ struct vm_area_struct *new_vma)
{
- const char *vma_name = vma_anon_name(vma);
+ struct anon_vma_name *anon_name = vma_anon_name(orig_vma);

- /* either both NULL, or pointers to same string */
- if (vma_name == name)
+ if (anon_name) {
+ anon_vma_name_get(anon_name);
+ new_vma->anon_name = anon_name;
+ }
+}
+
+static inline void free_vma_anon_name(struct vm_area_struct *vma)
+{
+ /* Can't use vma_anon_name because vma->vm_mm might not be held */
+ if (!vma->vm_file)
+ anon_vma_name_put(vma->anon_name);
+}
+
+static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
+ struct anon_vma_name *anon_name2)
+{
+ if (anon_name1 == anon_name2)
return true;

- return name && vma_name && !strcmp(name, vma_name);
+ return anon_name1 && anon_name2 &&
+ !strcmp(anon_name1->name, anon_name2->name);
}
+
#else /* CONFIG_ANON_VMA_NAME */
-static inline const char *vma_anon_name(struct vm_area_struct *vma)
+static inline struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma)
{
return NULL;
}
+
+static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
+{
+ return NULL;
+}
+
+static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
+static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
- struct vm_area_struct *new_vma) {}
+ struct vm_area_struct *new_vma) {}
static inline void free_vma_anon_name(struct vm_area_struct *vma) {}
-static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
- const char *name)
+
+static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
+ struct anon_vma_name *anon_name2)
{
return true;
}
+
#endif /* CONFIG_ANON_VMA_NAME */

static inline void init_tlb_flush_pending(struct mm_struct *mm)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5140e5feb486..1d298ff2e1d0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -416,7 +416,10 @@ struct vm_area_struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} shared;
- /* Serialized by mmap_sem. */
+ /*
+ * Serialized by mmap_sem. Never use directly because it is
+ * valid only when vm_file is NULL. Use vma_anon_name instead.
+ */
struct anon_vma_name *anon_name;
};

diff --git a/kernel/sys.c b/kernel/sys.c
index ecc4cf019242..60c3f9eac9eb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -7,6 +7,7 @@

#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/reboot.h>
@@ -2278,15 +2279,16 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
{
struct mm_struct *mm = current->mm;
const char __user *uname;
- char *name, *pch;
+ struct anon_vma_name *anon_name;
int error;

switch (opt) {
case PR_SET_VMA_ANON_NAME:
uname = (const char __user *)arg;
if (uname) {
- name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+ char *name, *pch;

+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
if (IS_ERR(name))
return PTR_ERR(name);

@@ -2296,15 +2298,21 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
return -EINVAL;
}
}
+ /* anon_vma has its own copy */
+ anon_name = anon_vma_name_alloc(name);
+ kfree(name);
+ if (!anon_name)
+ return -ENOMEM;
+
} else {
/* Reset the name */
- name = NULL;
+ anon_name = NULL;
}

mmap_write_lock(mm);
- error = madvise_set_anon_name(mm, addr, size, name);
+ error = madvise_set_anon_name(mm, addr, size, anon_name);
mmap_write_unlock(mm);
- kfree(name);
+ anon_vma_name_put(anon_name);
break;
default:
error = -EINVAL;
diff --git a/mm/madvise.c b/mm/madvise.c
index 5604064df464..f81d62d8ce9b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,11 +65,14 @@ static int madvise_need_mmap_write(int behavior)
}

#ifdef CONFIG_ANON_VMA_NAME
-static struct anon_vma_name *anon_vma_name_alloc(const char *name)
+struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
struct anon_vma_name *anon_name;
size_t count;

+ if (!name)
+ return NULL;
+
/* Add 1 for NUL terminator at the end of the anon_name->name */
count = strlen(name) + 1;
anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
@@ -81,78 +84,55 @@ static struct anon_vma_name *anon_vma_name_alloc(const char *name)
return anon_name;
}

-static void vma_anon_name_free(struct kref *kref)
+struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma)
{
- struct anon_vma_name *anon_name =
- container_of(kref, struct anon_vma_name, kref);
- kfree(anon_name);
-}
-
-static inline bool has_vma_anon_name(struct vm_area_struct *vma)
-{
- return !vma->vm_file && vma->anon_name;
-}
+ mmap_assert_locked(vma->vm_mm);

-const char *vma_anon_name(struct vm_area_struct *vma)
-{
- if (!has_vma_anon_name(vma))
+ if (vma->vm_file)
return NULL;

- mmap_assert_locked(vma->vm_mm);
-
- return vma->anon_name->name;
+ return vma->anon_name;
}

-void dup_vma_anon_name(struct vm_area_struct *orig_vma,
- struct vm_area_struct *new_vma)
+static void vma_anon_name_free(struct kref *kref)
{
- if (!has_vma_anon_name(orig_vma))
- return;
-
- kref_get(&orig_vma->anon_name->kref);
- new_vma->anon_name = orig_vma->anon_name;
+ struct anon_vma_name *anon_name =
+ container_of(kref, struct anon_vma_name, kref);
+ kfree(anon_name);
}

-void free_vma_anon_name(struct vm_area_struct *vma)
+void anon_vma_name_put(struct anon_vma_name *anon_name)
{
- struct anon_vma_name *anon_name;
-
- if (!has_vma_anon_name(vma))
- return;
-
- anon_name = vma->anon_name;
- vma->anon_name = NULL;
- kref_put(&anon_name->kref, vma_anon_name_free);
+ if (anon_name)
+ kref_put(&anon_name->kref, vma_anon_name_free);
}

/* mmap_lock should be write-locked */
-static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+static int replace_vma_anon_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
{
- const char *anon_name;
+ struct anon_vma_name *orig_name = vma_anon_name(vma);

- if (!name) {
- free_vma_anon_name(vma);
+ if (!anon_name) {
+ vma->anon_name = NULL;
+ anon_vma_name_put(orig_name);
return 0;
}

- anon_name = vma_anon_name(vma);
- if (anon_name) {
- /* Same name, nothing to do here */
- if (!strcmp(name, anon_name))
- return 0;
+ if (anon_vma_name_eq(orig_name, anon_name))
+ return 0;

- free_vma_anon_name(vma);
- }
- vma->anon_name = anon_vma_name_alloc(name);
- if (!vma->anon_name)
- return -ENOMEM;
+ anon_vma_name_get(anon_name);
+ vma->anon_name = anon_name;
+ anon_vma_name_put(orig_name);

return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
-static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+static int replace_vma_anon_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
{
- if (name)
+ if (anon_name)
return -EINVAL;

return 0;
@@ -165,13 +145,13 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, unsigned long new_flags,
- const char *name)
+ struct anon_vma_name *anon_name)
{
struct mm_struct *mm = vma->vm_mm;
int error;
pgoff_t pgoff;

- if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) {
+ if (new_flags == vma->vm_flags && anon_vma_name_eq(vma_anon_name(vma), anon_name)) {
*prev = vma;
return 0;
}
@@ -179,7 +159,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, name);
+ vma->vm_userfaultfd_ctx, anon_name);
if (*prev) {
vma = *prev;
goto success;
@@ -209,7 +189,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
*/
vma->vm_flags = new_flags;
if (!vma->vm_file) {
- error = replace_vma_anon_name(vma, name);
+ error = replace_vma_anon_name(vma, anon_name);
if (error)
return error;
}
@@ -976,6 +956,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
{
int error;
unsigned long new_flags = vma->vm_flags;
+ struct anon_vma_name *anon_name;

switch (behavior) {
case MADV_REMOVE:
@@ -1040,8 +1021,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
}

+ anon_name = vma_anon_name(vma);
error = madvise_update_vma(vma, prev, start, end, new_flags,
- vma_anon_name(vma));
+ anon_name);

out:
/*
@@ -1225,7 +1207,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
static int madvise_vma_anon_name(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long name)
+ unsigned long anon_name)
{
int error;

@@ -1234,7 +1216,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
return -EBADF;

error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (const char *)name);
+ (struct anon_vma_name *)anon_name);

/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1246,7 +1228,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, const char *name)
+ unsigned long len_in, struct anon_vma_name *anon_name)
{
unsigned long end;
unsigned long len;
@@ -1266,8 +1248,8 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
if (end == start)
return 0;

- return madvise_walk_vmas(mm, start, end, (unsigned long)name,
- madvise_vma_anon_name);
+ return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+ madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 1e8fdb0b51ed..80d2ae204a98 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1031,7 +1031,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -1049,7 +1049,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
- if (!is_same_vma_anon_name(vma, anon_name))
+ if (!anon_vma_name_eq(vma_anon_name(vma), anon_name))
return 0;
return 1;
}
@@ -1084,7 +1084,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@ -1106,7 +1106,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@ -1167,7 +1167,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
--
2.35.1.473.g83b2b277ed-goog


2022-02-22 06:04:50

by Suren Baghdasaryan

[permalink] [raw]
Subject: [PATCH 2/3] mm: prevent vm_area_struct::anon_name refcount saturation

A deep process chain with many vmas could grow really high. kref
refcounting interface used in anon_vma_name structure will detect
a counter overflow when it reaches REFCOUNT_SATURATED value but will
only generate a warning about broken refcounting.
To ensure anon_vma_name refcount does not overflow, stop anon_vma_name
sharing when the refcount reaches INT_MAX, which still leaves INT_MAX/2
values before the counter reaches REFCOUNT_SATURATED. This should provide
enough headroom for raising the refcounts temporarily.

Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: Suren Baghdasaryan <[email protected]>
---
include/linux/mm_inline.h | 18 ++++++++++++++----
mm/madvise.c | 3 +--
2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 70b619442d56..b189e2638843 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -156,15 +156,25 @@ static inline void anon_vma_name_get(struct anon_vma_name *anon_name)

extern void anon_vma_name_put(struct anon_vma_name *anon_name);

+static inline
+struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
+{
+ /* Prevent anon_name refcount saturation early on */
+ if (kref_read(&anon_name->kref) < INT_MAX) {
+ anon_vma_name_get(anon_name);
+ return anon_name;
+
+ }
+ return anon_vma_name_alloc(anon_name->name);
+}
+
static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
struct vm_area_struct *new_vma)
{
struct anon_vma_name *anon_name = vma_anon_name(orig_vma);

- if (anon_name) {
- anon_vma_name_get(anon_name);
- new_vma->anon_name = anon_name;
- }
+ if (anon_name)
+ new_vma->anon_name = anon_vma_name_reuse(anon_name);
}

static inline void free_vma_anon_name(struct vm_area_struct *vma)
diff --git a/mm/madvise.c b/mm/madvise.c
index f81d62d8ce9b..a395884aeecb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -122,8 +122,7 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
if (anon_vma_name_eq(orig_name, anon_name))
return 0;

- anon_vma_name_get(anon_name);
- vma->anon_name = anon_name;
+ vma->anon_name = anon_vma_name_reuse(anon_name);
anon_vma_name_put(orig_name);

return 0;
--
2.35.1.473.g83b2b277ed-goog

2022-02-22 06:04:54

by Suren Baghdasaryan

[permalink] [raw]
Subject: [PATCH v4 3/3] mm: fix use-after-free when anon vma name is used after vma is freed

When adjacent vmas are being merged it can result in the vma that was
originally passed to madvise_update_vma being destroyed. In the current
implementation, the name parameter passed to madvise_update_vma points
directly to vma->anon_name->name and it is used after the call to
vma_merge. In the cases when vma_merge merges the original vma and
destroys it, this will result in use-after-free bug as shown below:

madvise_vma_behavior << passes vma->anon_name->name as name param
madvise_update_vma(name)
vma_merge
__vma_adjust
vm_area_free <-- frees the vma
replace_vma_anon_name(name) <-- UAF

Fix this by raising the name refcount and stabilizing it.

Fixes: 9a10064f5625 ("mm: add a field to store names for private anonymous memory")
Signed-off-by: Suren Baghdasaryan <[email protected]>
Reported-by: [email protected]
---
changes in v3:
- Reapplied the fix after code refactoring, per Michal Hocko

mm/madvise.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/mm/madvise.c b/mm/madvise.c
index a395884aeecb..00e8105430e9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,8 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
/*
* Update the vm_flags on region of a vma, splitting it or merging it as
* necessary. Must be called with mmap_sem held for writing;
+ * Caller should ensure anon_name stability by raising its refcount even when
+ * anon_name belongs to a valid vma because this function might free that vma.
*/
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
@@ -1021,8 +1023,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
}

anon_name = vma_anon_name(vma);
+ anon_vma_name_get(anon_name);
error = madvise_update_vma(vma, prev, start, end, new_flags,
anon_name);
+ anon_vma_name_put(anon_name);

out:
/*
--
2.35.1.473.g83b2b277ed-goog

2022-02-22 06:07:42

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH v4 3/3] mm: fix use-after-free when anon vma name is used after vma is freed

On Mon, Feb 21, 2022 at 9:40 PM Suren Baghdasaryan <[email protected]> wrote:
>
> When adjacent vmas are being merged it can result in the vma that was
> originally passed to madvise_update_vma being destroyed. In the current
> implementation, the name parameter passed to madvise_update_vma points
> directly to vma->anon_name->name and it is used after the call to
> vma_merge. In the cases when vma_merge merges the original vma and
> destroys it, this will result in use-after-free bug as shown below:
>
> madvise_vma_behavior << passes vma->anon_name->name as name param
> madvise_update_vma(name)
> vma_merge
> __vma_adjust
> vm_area_free <-- frees the vma
> replace_vma_anon_name(name) <-- UAF
>
> Fix this by raising the name refcount and stabilizing it.
>
> Fixes: 9a10064f5625 ("mm: add a field to store names for private anonymous memory")
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> Reported-by: [email protected]
> ---
> changes in v3:
> - Reapplied the fix after code refactoring, per Michal Hocko

Hi Andrew,
Since I needed to make some refactoring before adding this fix, in
order to apply this new version to mmotm you would need to revert the
previous version of this patch from your tree:
0cc16837d264 ("mm: fix use-after-free when anon vma name is used after
vma is freed")
and then apply the whole patchset (3 patches) after it is reviewed.
Sorry for the inconvenience but I think this way the refactoring and
the fix would be in the right order and with no overlap.
The patchset applies cleanly to Linus' ToT and to mmotm after
0cc16837d264 is reverted.
Thanks,
Suren.

>
> mm/madvise.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index a395884aeecb..00e8105430e9 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -140,6 +140,8 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
> /*
> * Update the vm_flags on region of a vma, splitting it or merging it as
> * necessary. Must be called with mmap_sem held for writing;
> + * Caller should ensure anon_name stability by raising its refcount even when
> + * anon_name belongs to a valid vma because this function might free that vma.
> */
> static int madvise_update_vma(struct vm_area_struct *vma,
> struct vm_area_struct **prev, unsigned long start,
> @@ -1021,8 +1023,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> }
>
> anon_name = vma_anon_name(vma);
> + anon_vma_name_get(anon_name);
> error = madvise_update_vma(vma, prev, start, end, new_flags,
> anon_name);
> + anon_vma_name_put(anon_name);
>
> out:
> /*
> --
> 2.35.1.473.g83b2b277ed-goog
>

2022-02-22 08:10:14

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH v4 3/3] mm: fix use-after-free when anon vma name is used after vma is freed

On Mon 21-02-22 21:40:25, Suren Baghdasaryan wrote:
> When adjacent vmas are being merged it can result in the vma that was
> originally passed to madvise_update_vma being destroyed. In the current
> implementation, the name parameter passed to madvise_update_vma points
> directly to vma->anon_name->name and it is used after the call to
> vma_merge. In the cases when vma_merge merges the original vma and
> destroys it, this will result in use-after-free bug as shown below:
>
> madvise_vma_behavior << passes vma->anon_name->name as name param
> madvise_update_vma(name)
> vma_merge
> __vma_adjust
> vm_area_free <-- frees the vma
> replace_vma_anon_name(name) <-- UAF

This seems to be stale because bare const char pointer is not passed in
the call chain. In fact I am not even sure there is any actual UAF here
after the rework.
Could you be more specific in describing the scenario?

> Fix this by raising the name refcount and stabilizing it.
>
> Fixes: 9a10064f5625 ("mm: add a field to store names for private anonymous memory")
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> Reported-by: [email protected]
> ---
> changes in v3:
> - Reapplied the fix after code refactoring, per Michal Hocko
>
> mm/madvise.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index a395884aeecb..00e8105430e9 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -140,6 +140,8 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
> /*
> * Update the vm_flags on region of a vma, splitting it or merging it as
> * necessary. Must be called with mmap_sem held for writing;
> + * Caller should ensure anon_name stability by raising its refcount even when
> + * anon_name belongs to a valid vma because this function might free that vma.
> */
> static int madvise_update_vma(struct vm_area_struct *vma,
> struct vm_area_struct **prev, unsigned long start,
> @@ -1021,8 +1023,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> }
>
> anon_name = vma_anon_name(vma);
> + anon_vma_name_get(anon_name);
> error = madvise_update_vma(vma, prev, start, end, new_flags,
> anon_name);
> + anon_vma_name_put(anon_name);
>
> out:
> /*
> --
> 2.35.1.473.g83b2b277ed-goog

--
Michal Hocko
SUSE Labs

2022-02-22 09:08:56

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 1/3] mm: refactor vm_area_struct::anon_vma_name usage code

On Mon 21-02-22 21:40:23, Suren Baghdasaryan wrote:
> Avoid mixing strings and their anon_vma_name referenced pointers
> by using struct anon_vma_name whenever possible. This simplifies
> the code and allows easier sharing of anon_vma_name structures when
> they represent the same name.

I just diffed this to my half baked proposal to see for changes. Let me
comment on those as this will be very likely easier to review than
looking at the new code.

> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index 17c20597e244..70b619442d56 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -140,61 +140,73 @@ static __always_inline void del_page_from_lru_list(struct page *page,
>
> #ifdef CONFIG_ANON_VMA_NAME
> /*
> - * mmap_lock should be read-locked when calling vma_anon_name() and while using
> - * the returned pointer.
> + * mmap_lock should be read-locked when calling vma_anon_name(). Caller should
> + * either keep holding the lock while using the returned pointer or it should
> + * raise anon_vma_name refcount before releasing the lock.
> */
> -extern const char *vma_anon_name(struct vm_area_struct *vma);
> +extern struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma);

OK, I can see some reason to force final users of the name to dig it out
of the struct but I would recommend unifying the naming. vma_anon_name
makes sense if you are providing the real name. But for all functions
which operate on anon_vma_name I would stick with anon_vma_$FOO.
You seem to be inconsistent in that regards. E.g. dup_vma_anon_name
but anon_vma_name_{get,put}. I do not really care one way or the other
but please make then consistent.

> +extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
>
> -static inline void anon_vma_name_get(struct anon_vma_name *name)
> +/* mmap_lock should be read-locked */
> +static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
> {
> - if (!name)
> - return;
> -
> - kref_get(&name->kref);
> + if (anon_name)
> + kref_get(&anon_name->kref);
> }
>
> -void vma_anon_name_free(struct kref *kref);
> -static inline void anon_vma_name_put(struct anon_vma_name *name)
> -{
> - if (!name)
> - return;
> -
> - kref_put(&name->kref, vma_anon_name_free);
> -}
> +extern void anon_vma_name_put(struct anon_vma_name *anon_name);

I would keep get and put close together. It is just easier to follow the
code that way. But no strong preference here.

> -static inline bool anon_vma_name_eq(struct anon_vma_name *name1, struct anon_vma_name *name2)
> +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,

dup_anon_vma_name

> static inline void free_vma_anon_name(struct vm_area_struct *vma)

free_anon_vma_name

> {
> - anon_vma_name_put(vma->anon_name);
> + /* Can't use vma_anon_name because vma->vm_mm might not be held */

I do not follow. I guess you meant to say that mmap_lock might not be
held but why is that important in this context when the vma goes away?
Nobody can be copying anon_name from it?

> + if (!vma->vm_file)
> + anon_vma_name_put(vma->anon_name);
> }

[...]
> diff --git a/mm/madvise.c b/mm/madvise.c
> index f2f8065f67c1..f81d62d8ce9b 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -70,6 +70,9 @@ struct anon_vma_name *anon_vma_name_alloc(const char *name)
> struct anon_vma_name *anon_name;
> size_t count;
>
> + if (!name)
> + return NULL;
> +

This is not really needed.

> /* Add 1 for NUL terminator at the end of the anon_name->name */
> count = strlen(name) + 1;
> anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);

[...]
> @@ -947,6 +956,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> {
> int error;
> unsigned long new_flags = vma->vm_flags;
> + struct anon_vma_name *anon_name;
>
> switch (behavior) {
> case MADV_REMOVE:
> @@ -1011,8 +1021,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> break;
> }
>
> + anon_name = vma_anon_name(vma);
> error = madvise_update_vma(vma, prev, start, end, new_flags,
> - vma->anon_name);
> + anon_name);
>
> out:

You can use vma_anon_name directly, no need to add 2 more lines of code
;)

> @@ -1237,8 +1248,8 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
> if (end == start)
> return 0;
>
> - return madvise_walk_vmas(mm, start, end, (unsigned long)name,
> - madvise_vma_anon_name);
> + return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
> + madvise_vma_anon_name);

whitespace noise

Other that that nothing else really jumped at me. Here is a diff I
propose on top of this patch. Mostly naming unification and you will get
8 less lines in the end ;)
---
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5bfcf24493ac..2c48b1eaaa9c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -327,7 +327,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}

- anon_name = vma_anon_name(vma);
+ anon_name = anon_vma_name(vma);
if (anon_name) {
seq_pad(m, ' ');
seq_printf(m, "[anon:%s]", anon_name->name);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e26b10132d47..8e03b3d3f5fa 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -878,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX, vma_anon_name(vma));
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev)
vma = prev;
else
@@ -1438,7 +1438,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx }),
- vma_anon_name(vma));
+ anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
@@ -1615,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX, vma_anon_name(vma));
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 70b619442d56..60b48e4a5560 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -140,11 +140,11 @@ static __always_inline void del_page_from_lru_list(struct page *page,

#ifdef CONFIG_ANON_VMA_NAME
/*
- * mmap_lock should be read-locked when calling vma_anon_name(). Caller should
+ * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
* either keep holding the lock while using the returned pointer or it should
* raise anon_vma_name refcount before releasing the lock.
*/
-extern struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma);
+extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
extern struct anon_vma_name *anon_vma_name_alloc(const char *name);

/* mmap_lock should be read-locked */
@@ -156,10 +156,10 @@ static inline void anon_vma_name_get(struct anon_vma_name *anon_name)

extern void anon_vma_name_put(struct anon_vma_name *anon_name);

-static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
struct vm_area_struct *new_vma)
{
- struct anon_vma_name *anon_name = vma_anon_name(orig_vma);
+ struct anon_vma_name *anon_name = anon_vma_name(orig_vma);

if (anon_name) {
anon_vma_name_get(anon_name);
@@ -167,7 +167,7 @@ static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
}
}

-static inline void free_vma_anon_name(struct vm_area_struct *vma)
+static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
/* Can't use vma_anon_name because vma->vm_mm might not be held */
if (!vma->vm_file)
@@ -185,7 +185,7 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
}

#else /* CONFIG_ANON_VMA_NAME */
-static inline struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma)
+static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
return NULL;
}
@@ -197,9 +197,9 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)

static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
-static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
struct vm_area_struct *new_vma) {}
-static inline void free_vma_anon_name(struct vm_area_struct *vma) {}
+static inline void free_anon_vma_name(struct vm_area_struct *vma) {}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
struct anon_vma_name *anon_name2)
diff --git a/kernel/fork.c b/kernel/fork.c
index d75a528f7b21..e7dc5c29212c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -366,14 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
- dup_vma_anon_name(orig, new);
+ dup_anon_vma_name(orig, new);
}
return new;
}

void vm_area_free(struct vm_area_struct *vma)
{
- free_vma_anon_name(vma);
+ free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}

diff --git a/kernel/sys.c b/kernel/sys.c
index 60c3f9eac9eb..c150aa335eeb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2279,7 +2279,7 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
{
struct mm_struct *mm = current->mm;
const char __user *uname;
- struct anon_vma_name *anon_name;
+ struct anon_vma_name *anon_name = NULL;
int error;

switch (opt) {
@@ -2304,9 +2304,6 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
if (!anon_name)
return -ENOMEM;

- } else {
- /* Reset the name */
- anon_name = NULL;
}

mmap_write_lock(mm);
diff --git a/mm/madvise.c b/mm/madvise.c
index f81d62d8ce9b..f23c66d15bd5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -70,9 +70,6 @@ struct anon_vma_name *anon_vma_name_alloc(const char *name)
struct anon_vma_name *anon_name;
size_t count;

- if (!name)
- return NULL;
-
/* Add 1 for NUL terminator at the end of the anon_name->name */
count = strlen(name) + 1;
anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
@@ -84,7 +81,7 @@ struct anon_vma_name *anon_vma_name_alloc(const char *name)
return anon_name;
}

-struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma)
+struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
mmap_assert_locked(vma->vm_mm);

@@ -108,10 +105,10 @@ void anon_vma_name_put(struct anon_vma_name *anon_name)
}

/* mmap_lock should be write-locked */
-static int replace_vma_anon_name(struct vm_area_struct *vma,
+static int replace_anon_vma_name(struct vm_area_struct *vma,
struct anon_vma_name *anon_name)
{
- struct anon_vma_name *orig_name = vma_anon_name(vma);
+ struct anon_vma_name *orig_name = anon_vma_name(vma);

if (!anon_name) {
vma->anon_name = NULL;
@@ -129,7 +126,7 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
-static int replace_vma_anon_name(struct vm_area_struct *vma,
+static int replace_anon_vma_name(struct vm_area_struct *vma,
struct anon_vma_name *anon_name)
{
if (anon_name)
@@ -151,7 +148,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
int error;
pgoff_t pgoff;

- if (new_flags == vma->vm_flags && anon_vma_name_eq(vma_anon_name(vma), anon_name)) {
+ if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
*prev = vma;
return 0;
}
@@ -189,7 +186,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
*/
vma->vm_flags = new_flags;
if (!vma->vm_file) {
- error = replace_vma_anon_name(vma, anon_name);
+ error = replace_anon_vma_name(vma, anon_name);
if (error)
return error;
}
@@ -956,7 +953,6 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
{
int error;
unsigned long new_flags = vma->vm_flags;
- struct anon_vma_name *anon_name;

switch (behavior) {
case MADV_REMOVE:
@@ -1021,9 +1017,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
}

- anon_name = vma_anon_name(vma);
error = madvise_update_vma(vma, prev, start, end, new_flags,
- anon_name);
+ anon_vma_name(vma));

out:
/*
@@ -1248,8 +1243,8 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
if (end == start)
return 0;

- return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
- madvise_vma_anon_name);
+ return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+ madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 028e8dd82b44..69284d3b5e53 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -814,7 +814,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx,
- vma_anon_name(vma));
+ anon_vma_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f584eddd305..25934e7db3e1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index 80d2ae204a98..ad6a1fffee91 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1049,7 +1049,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
- if (!anon_vma_name_eq(vma_anon_name(vma), anon_name))
+ if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
return 0;
return 1;
}
@@ -3255,7 +3255,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0138dfcdb1d8..9fce6860cdab 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
--
Michal Hocko
SUSE Labs

2022-02-22 09:48:56

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 2/3] mm: prevent vm_area_struct::anon_name refcount saturation

On Mon 21-02-22 21:40:24, Suren Baghdasaryan wrote:
> A deep process chain with many vmas could grow really high.

This would really benefit from some numbers. With default
sysctl_max_map_count (64k) and default pid_max (32k) the INT_MAX could
be theoretically reached but I find it impractical because not all vmas
can be anonymous same as all available pids can be consumed for a
theoretical attack (if my counting is proper).
On the other hand any non-default configuration with any of the values
increased could hit this theoretically.

> kref
> refcounting interface used in anon_vma_name structure will detect
> a counter overflow when it reaches REFCOUNT_SATURATED value but will
> only generate a warning about broken refcounting.
> To ensure anon_vma_name refcount does not overflow, stop anon_vma_name
> sharing when the refcount reaches INT_MAX, which still leaves INT_MAX/2
> values before the counter reaches REFCOUNT_SATURATED. This should provide
> enough headroom for raising the refcounts temporarily.
>
> Suggested-by: Michal Hocko <[email protected]>
> Signed-off-by: Suren Baghdasaryan <[email protected]>
> ---
> include/linux/mm_inline.h | 18 ++++++++++++++----
> mm/madvise.c | 3 +--
> 2 files changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index 70b619442d56..b189e2638843 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -156,15 +156,25 @@ static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
>
> extern void anon_vma_name_put(struct anon_vma_name *anon_name);
>
> +static inline
> +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
> +{
> + /* Prevent anon_name refcount saturation early on */
> + if (kref_read(&anon_name->kref) < INT_MAX) {

REFCOUNT_MAX seems to be defined by the kref framework.

Other than that looks good to me.

> + anon_vma_name_get(anon_name);
> + return anon_name;
> +
> + }
> + return anon_vma_name_alloc(anon_name->name);
> +}
> +
> static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
> struct vm_area_struct *new_vma)
> {
> struct anon_vma_name *anon_name = vma_anon_name(orig_vma);
>
> - if (anon_name) {
> - anon_vma_name_get(anon_name);
> - new_vma->anon_name = anon_name;
> - }
> + if (anon_name)
> + new_vma->anon_name = anon_vma_name_reuse(anon_name);
> }
>
> static inline void free_vma_anon_name(struct vm_area_struct *vma)
> diff --git a/mm/madvise.c b/mm/madvise.c
> index f81d62d8ce9b..a395884aeecb 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -122,8 +122,7 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
> if (anon_vma_name_eq(orig_name, anon_name))
> return 0;
>
> - anon_vma_name_get(anon_name);
> - vma->anon_name = anon_name;
> + vma->anon_name = anon_vma_name_reuse(anon_name);
> anon_vma_name_put(orig_name);
>
> return 0;
> --
> 2.35.1.473.g83b2b277ed-goog

--
Michal Hocko
SUSE Labs

2022-02-22 15:53:15

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH v4 3/3] mm: fix use-after-free when anon vma name is used after vma is freed

On Tue, Feb 22, 2022 at 12:06 AM Michal Hocko <[email protected]> wrote:
>
> On Mon 21-02-22 21:40:25, Suren Baghdasaryan wrote:
> > When adjacent vmas are being merged it can result in the vma that was
> > originally passed to madvise_update_vma being destroyed. In the current
> > implementation, the name parameter passed to madvise_update_vma points
> > directly to vma->anon_name->name and it is used after the call to
> > vma_merge. In the cases when vma_merge merges the original vma and
> > destroys it, this will result in use-after-free bug as shown below:
> >
> > madvise_vma_behavior << passes vma->anon_name->name as name param
> > madvise_update_vma(name)
> > vma_merge
> > __vma_adjust
> > vm_area_free <-- frees the vma
> > replace_vma_anon_name(name) <-- UAF
>
> This seems to be stale because bare const char pointer is not passed in
> the call chain. In fact I am not even sure there is any actual UAF here
> after the rework.
> Could you be more specific in describing the scenario?

Yes, sorry, I need to update the part of the description talking about
passing vma->anon_name->name directly.
I think UAF is still there, it's just harder to reproduce (admittedly
I could not reproduce it with the previous reproducer). The scenario
would be when a vma with vma->anon_name->kref == 1 is being merged
with another one and freed in the process:

madvise_vma_behavior
anon_name = vma_anon_name(vma) <-- does not increase refcount
madvise_update_vma(anon_name)
*prev = vma_merge <-- returns another vma
__vma_adjust
vm_area_free(vma)
free_vma_anon_name
anon_vma_name_put
vma_anon_name_free <-- frees the vma->anon_name
vma = *prev <-- original vma was freed
replace_vma_anon_name(vma, >>anon_name<<) <-- UAF

Does this make sense or did I miss something?
Thanks,
Suren.

>
> > Fix this by raising the name refcount and stabilizing it.
> >
> > Fixes: 9a10064f5625 ("mm: add a field to store names for private anonymous memory")
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > Reported-by: [email protected]
> > ---
> > changes in v3:
> > - Reapplied the fix after code refactoring, per Michal Hocko
> >
> > mm/madvise.c | 4 ++++
> > 1 file changed, 4 insertions(+)
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index a395884aeecb..00e8105430e9 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -140,6 +140,8 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
> > /*
> > * Update the vm_flags on region of a vma, splitting it or merging it as
> > * necessary. Must be called with mmap_sem held for writing;
> > + * Caller should ensure anon_name stability by raising its refcount even when
> > + * anon_name belongs to a valid vma because this function might free that vma.
> > */
> > static int madvise_update_vma(struct vm_area_struct *vma,
> > struct vm_area_struct **prev, unsigned long start,
> > @@ -1021,8 +1023,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> > }
> >
> > anon_name = vma_anon_name(vma);
> > + anon_vma_name_get(anon_name);
> > error = madvise_update_vma(vma, prev, start, end, new_flags,
> > anon_name);
> > + anon_vma_name_put(anon_name);
> >
> > out:
> > /*
> > --
> > 2.35.1.473.g83b2b277ed-goog
>
> --
> Michal Hocko
> SUSE Labs

2022-02-22 23:54:49

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH 1/3] mm: refactor vm_area_struct::anon_vma_name usage code

On Tue, Feb 22, 2022 at 12:51 AM Michal Hocko <[email protected]> wrote:
>
> On Mon 21-02-22 21:40:23, Suren Baghdasaryan wrote:
> > Avoid mixing strings and their anon_vma_name referenced pointers
> > by using struct anon_vma_name whenever possible. This simplifies
> > the code and allows easier sharing of anon_vma_name structures when
> > they represent the same name.
>
> I just diffed this to my half baked proposal to see for changes. Let me
> comment on those as this will be very likely easier to review than
> looking at the new code.

Thanks for the effort!

>
> > diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> > index 17c20597e244..70b619442d56 100644
> > --- a/include/linux/mm_inline.h
> > +++ b/include/linux/mm_inline.h
> > @@ -140,61 +140,73 @@ static __always_inline void del_page_from_lru_list(struct page *page,
> >
> > #ifdef CONFIG_ANON_VMA_NAME
> > /*
> > - * mmap_lock should be read-locked when calling vma_anon_name() and while using
> > - * the returned pointer.
> > + * mmap_lock should be read-locked when calling vma_anon_name(). Caller should
> > + * either keep holding the lock while using the returned pointer or it should
> > + * raise anon_vma_name refcount before releasing the lock.
> > */
> > -extern const char *vma_anon_name(struct vm_area_struct *vma);
> > +extern struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma);
>
> OK, I can see some reason to force final users of the name to dig it out
> of the struct but I would recommend unifying the naming. vma_anon_name
> makes sense if you are providing the real name. But for all functions
> which operate on anon_vma_name I would stick with anon_vma_$FOO.
> You seem to be inconsistent in that regards. E.g. dup_vma_anon_name
> but anon_vma_name_{get,put}. I do not really care one way or the other
> but please make then consistent.

Ack. Will change the names the way you suggested.

>
> > +extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
> >
> > -static inline void anon_vma_name_get(struct anon_vma_name *name)
> > +/* mmap_lock should be read-locked */
> > +static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
> > {
> > - if (!name)
> > - return;
> > -
> > - kref_get(&name->kref);
> > + if (anon_name)
> > + kref_get(&anon_name->kref);
> > }
> >
> > -void vma_anon_name_free(struct kref *kref);
> > -static inline void anon_vma_name_put(struct anon_vma_name *name)
> > -{
> > - if (!name)
> > - return;
> > -
> > - kref_put(&name->kref, vma_anon_name_free);
> > -}
> > +extern void anon_vma_name_put(struct anon_vma_name *anon_name);
>
> I would keep get and put close together. It is just easier to follow the
> code that way. But no strong preference here.

Ack.

>
> > -static inline bool anon_vma_name_eq(struct anon_vma_name *name1, struct anon_vma_name *name2)
> > +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
>
> dup_anon_vma_name
>
> > static inline void free_vma_anon_name(struct vm_area_struct *vma)
>
> free_anon_vma_name
>
> > {
> > - anon_vma_name_put(vma->anon_name);
> > + /* Can't use vma_anon_name because vma->vm_mm might not be held */
>
> I do not follow. I guess you meant to say that mmap_lock might not be
> held but why is that important in this context when the vma goes away?
> Nobody can be copying anon_name from it?

I wanted to explain why I can't use vma_anon_name and have to check
vma->vm_file directly instead. vma_anon_name has
mmap_assert_locked(vma->vm_mm) and will generate a warning if called
without mmap_lock being held. I will write a more descriptive comment
here.

>
> > + if (!vma->vm_file)
> > + anon_vma_name_put(vma->anon_name);
> > }
>
> [...]
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index f2f8065f67c1..f81d62d8ce9b 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -70,6 +70,9 @@ struct anon_vma_name *anon_vma_name_alloc(const char *name)
> > struct anon_vma_name *anon_name;
> > size_t count;
> >
> > + if (!name)
> > + return NULL;
> > +
>
> This is not really needed.

True. Will remove.

>
> > /* Add 1 for NUL terminator at the end of the anon_name->name */
> > count = strlen(name) + 1;
> > anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
>
> [...]
> > @@ -947,6 +956,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> > {
> > int error;
> > unsigned long new_flags = vma->vm_flags;
> > + struct anon_vma_name *anon_name;
> >
> > switch (behavior) {
> > case MADV_REMOVE:
> > @@ -1011,8 +1021,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> > break;
> > }
> >
> > + anon_name = vma_anon_name(vma);
> > error = madvise_update_vma(vma, prev, start, end, new_flags,
> > - vma->anon_name);
> > + anon_name);
> >
> > out:
>
> You can use vma_anon_name directly, no need to add 2 more lines of code
> ;)

Ack.

>
> > @@ -1237,8 +1248,8 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
> > if (end == start)
> > return 0;
> >
> > - return madvise_walk_vmas(mm, start, end, (unsigned long)name,
> > - madvise_vma_anon_name);
> > + return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
> > + madvise_vma_anon_name);
>
> whitespace noise

Ack.

>
> Other that that nothing else really jumped at me. Here is a diff I
> propose on top of this patch. Mostly naming unification and you will get
> 8 less lines in the end ;)

Thanks. Will be incorporated into the next revision.

> ---
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 5bfcf24493ac..2c48b1eaaa9c 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -327,7 +327,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
> goto done;
> }
>
> - anon_name = vma_anon_name(vma);
> + anon_name = anon_vma_name(vma);
> if (anon_name) {
> seq_pad(m, ' ');
> seq_printf(m, "[anon:%s]", anon_name->name);
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index e26b10132d47..8e03b3d3f5fa 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -878,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
> new_flags, vma->anon_vma,
> vma->vm_file, vma->vm_pgoff,
> vma_policy(vma),
> - NULL_VM_UFFD_CTX, vma_anon_name(vma));
> + NULL_VM_UFFD_CTX, anon_vma_name(vma));
> if (prev)
> vma = prev;
> else
> @@ -1438,7 +1438,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
> vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> vma_policy(vma),
> ((struct vm_userfaultfd_ctx){ ctx }),
> - vma_anon_name(vma));
> + anon_vma_name(vma));
> if (prev) {
> vma = prev;
> goto next;
> @@ -1615,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
> prev = vma_merge(mm, prev, start, vma_end, new_flags,
> vma->anon_vma, vma->vm_file, vma->vm_pgoff,
> vma_policy(vma),
> - NULL_VM_UFFD_CTX, vma_anon_name(vma));
> + NULL_VM_UFFD_CTX, anon_vma_name(vma));
> if (prev) {
> vma = prev;
> goto next;
> diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> index 70b619442d56..60b48e4a5560 100644
> --- a/include/linux/mm_inline.h
> +++ b/include/linux/mm_inline.h
> @@ -140,11 +140,11 @@ static __always_inline void del_page_from_lru_list(struct page *page,
>
> #ifdef CONFIG_ANON_VMA_NAME
> /*
> - * mmap_lock should be read-locked when calling vma_anon_name(). Caller should
> + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
> * either keep holding the lock while using the returned pointer or it should
> * raise anon_vma_name refcount before releasing the lock.
> */
> -extern struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma);
> +extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
> extern struct anon_vma_name *anon_vma_name_alloc(const char *name);
>
> /* mmap_lock should be read-locked */
> @@ -156,10 +156,10 @@ static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
>
> extern void anon_vma_name_put(struct anon_vma_name *anon_name);
>
> -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
> +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
> struct vm_area_struct *new_vma)
> {
> - struct anon_vma_name *anon_name = vma_anon_name(orig_vma);
> + struct anon_vma_name *anon_name = anon_vma_name(orig_vma);
>
> if (anon_name) {
> anon_vma_name_get(anon_name);
> @@ -167,7 +167,7 @@ static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
> }
> }
>
> -static inline void free_vma_anon_name(struct vm_area_struct *vma)
> +static inline void free_anon_vma_name(struct vm_area_struct *vma)
> {
> /* Can't use vma_anon_name because vma->vm_mm might not be held */
> if (!vma->vm_file)
> @@ -185,7 +185,7 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
> }
>
> #else /* CONFIG_ANON_VMA_NAME */
> -static inline struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma)
> +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
> {
> return NULL;
> }
> @@ -197,9 +197,9 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
>
> static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
> static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
> -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
> +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
> struct vm_area_struct *new_vma) {}
> -static inline void free_vma_anon_name(struct vm_area_struct *vma) {}
> +static inline void free_anon_vma_name(struct vm_area_struct *vma) {}
>
> static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
> struct anon_vma_name *anon_name2)
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d75a528f7b21..e7dc5c29212c 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -366,14 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> *new = data_race(*orig);
> INIT_LIST_HEAD(&new->anon_vma_chain);
> new->vm_next = new->vm_prev = NULL;
> - dup_vma_anon_name(orig, new);
> + dup_anon_vma_name(orig, new);
> }
> return new;
> }
>
> void vm_area_free(struct vm_area_struct *vma)
> {
> - free_vma_anon_name(vma);
> + free_anon_vma_name(vma);
> kmem_cache_free(vm_area_cachep, vma);
> }
>
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 60c3f9eac9eb..c150aa335eeb 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2279,7 +2279,7 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
> {
> struct mm_struct *mm = current->mm;
> const char __user *uname;
> - struct anon_vma_name *anon_name;
> + struct anon_vma_name *anon_name = NULL;
> int error;
>
> switch (opt) {
> @@ -2304,9 +2304,6 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
> if (!anon_name)
> return -ENOMEM;
>
> - } else {
> - /* Reset the name */
> - anon_name = NULL;
> }
>
> mmap_write_lock(mm);
> diff --git a/mm/madvise.c b/mm/madvise.c
> index f81d62d8ce9b..f23c66d15bd5 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -70,9 +70,6 @@ struct anon_vma_name *anon_vma_name_alloc(const char *name)
> struct anon_vma_name *anon_name;
> size_t count;
>
> - if (!name)
> - return NULL;
> -
> /* Add 1 for NUL terminator at the end of the anon_name->name */
> count = strlen(name) + 1;
> anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
> @@ -84,7 +81,7 @@ struct anon_vma_name *anon_vma_name_alloc(const char *name)
> return anon_name;
> }
>
> -struct anon_vma_name *vma_anon_name(struct vm_area_struct *vma)
> +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
> {
> mmap_assert_locked(vma->vm_mm);
>
> @@ -108,10 +105,10 @@ void anon_vma_name_put(struct anon_vma_name *anon_name)
> }
>
> /* mmap_lock should be write-locked */
> -static int replace_vma_anon_name(struct vm_area_struct *vma,
> +static int replace_anon_vma_name(struct vm_area_struct *vma,
> struct anon_vma_name *anon_name)
> {
> - struct anon_vma_name *orig_name = vma_anon_name(vma);
> + struct anon_vma_name *orig_name = anon_vma_name(vma);
>
> if (!anon_name) {
> vma->anon_name = NULL;
> @@ -129,7 +126,7 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
> return 0;
> }
> #else /* CONFIG_ANON_VMA_NAME */
> -static int replace_vma_anon_name(struct vm_area_struct *vma,
> +static int replace_anon_vma_name(struct vm_area_struct *vma,
> struct anon_vma_name *anon_name)
> {
> if (anon_name)
> @@ -151,7 +148,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
> int error;
> pgoff_t pgoff;
>
> - if (new_flags == vma->vm_flags && anon_vma_name_eq(vma_anon_name(vma), anon_name)) {
> + if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
> *prev = vma;
> return 0;
> }
> @@ -189,7 +186,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
> */
> vma->vm_flags = new_flags;
> if (!vma->vm_file) {
> - error = replace_vma_anon_name(vma, anon_name);
> + error = replace_anon_vma_name(vma, anon_name);
> if (error)
> return error;
> }
> @@ -956,7 +953,6 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> {
> int error;
> unsigned long new_flags = vma->vm_flags;
> - struct anon_vma_name *anon_name;
>
> switch (behavior) {
> case MADV_REMOVE:
> @@ -1021,9 +1017,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> break;
> }
>
> - anon_name = vma_anon_name(vma);
> error = madvise_update_vma(vma, prev, start, end, new_flags,
> - anon_name);
> + anon_vma_name(vma));
>
> out:
> /*
> @@ -1248,8 +1243,8 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
> if (end == start)
> return 0;
>
> - return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
> - madvise_vma_anon_name);
> + return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
> + madvise_vma_anon_name);
> }
> #endif /* CONFIG_ANON_VMA_NAME */
> /*
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 028e8dd82b44..69284d3b5e53 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -814,7 +814,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
> prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
> vma->anon_vma, vma->vm_file, pgoff,
> new_pol, vma->vm_userfaultfd_ctx,
> - vma_anon_name(vma));
> + anon_vma_name(vma));
> if (prev) {
> vma = prev;
> next = vma->vm_next;
> diff --git a/mm/mlock.c b/mm/mlock.c
> index 8f584eddd305..25934e7db3e1 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
> pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
> vma->vm_file, pgoff, vma_policy(vma),
> - vma->vm_userfaultfd_ctx, vma_anon_name(vma));
> + vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> if (*prev) {
> vma = *prev;
> goto success;
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 80d2ae204a98..ad6a1fffee91 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1049,7 +1049,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
> return 0;
> if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
> return 0;
> - if (!anon_vma_name_eq(vma_anon_name(vma), anon_name))
> + if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
> return 0;
> return 1;
> }
> @@ -3255,7 +3255,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> return NULL; /* should never get here */
> new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
> vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> - vma->vm_userfaultfd_ctx, vma_anon_name(vma));
> + vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> if (new_vma) {
> /*
> * Source vma may have been merged into new_vma
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 0138dfcdb1d8..9fce6860cdab 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
> pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
> *pprev = vma_merge(mm, *pprev, start, end, newflags,
> vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
> - vma->vm_userfaultfd_ctx, vma_anon_name(vma));
> + vma->vm_userfaultfd_ctx, anon_vma_name(vma));
> if (*pprev) {
> vma = *pprev;
> VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
> --
> Michal Hocko
> SUSE Labs

2022-02-23 03:59:29

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH 2/3] mm: prevent vm_area_struct::anon_name refcount saturation

On Tue, Feb 22, 2022 at 7:56 AM Suren Baghdasaryan <[email protected]> wrote:
>
> On Tue, Feb 22, 2022 at 1:17 AM Michal Hocko <[email protected]> wrote:
> >
> > On Mon 21-02-22 21:40:24, Suren Baghdasaryan wrote:
> > > A deep process chain with many vmas could grow really high.
> >
> > This would really benefit from some numbers. With default
> > sysctl_max_map_count (64k) and default pid_max (32k) the INT_MAX could
> > be theoretically reached but I find it impractical because not all vmas
> > can be anonymous same as all available pids can be consumed for a
> > theoretical attack (if my counting is proper).
> > On the other hand any non-default configuration with any of the values
> > increased could hit this theoretically.
>
> re: This would really benefit from some numbers
> Should I just add the details you provided above into the description?
> Would that suffice?

Hmm. According to the defaults you posted, with max number of
processes being 32k and max number of vmas per process 64k, the max
number of vmas in the system is 2147450880. That's 32767 less than
REFCOUNT_MAX=INT_MAX (2147483647) and 1073774592 less than
REFCOUNT_SATURATED (3221225472). So with those defaults we should
never hit these limits. Are we adding this protection for systems that
set non-default higher limits or am I miscalculating something?

>
> >
> > > kref
> > > refcounting interface used in anon_vma_name structure will detect
> > > a counter overflow when it reaches REFCOUNT_SATURATED value but will
> > > only generate a warning about broken refcounting.
> > > To ensure anon_vma_name refcount does not overflow, stop anon_vma_name
> > > sharing when the refcount reaches INT_MAX, which still leaves INT_MAX/2
> > > values before the counter reaches REFCOUNT_SATURATED. This should provide
> > > enough headroom for raising the refcounts temporarily.
> > >
> > > Suggested-by: Michal Hocko <[email protected]>
> > > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > > ---
> > > include/linux/mm_inline.h | 18 ++++++++++++++----
> > > mm/madvise.c | 3 +--
> > > 2 files changed, 15 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> > > index 70b619442d56..b189e2638843 100644
> > > --- a/include/linux/mm_inline.h
> > > +++ b/include/linux/mm_inline.h
> > > @@ -156,15 +156,25 @@ static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
> > >
> > > extern void anon_vma_name_put(struct anon_vma_name *anon_name);
> > >
> > > +static inline
> > > +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
> > > +{
> > > + /* Prevent anon_name refcount saturation early on */
> > > + if (kref_read(&anon_name->kref) < INT_MAX) {
> >
> > REFCOUNT_MAX seems to be defined by the kref framework.
>
> Ah, indeed. I missed that. Will change to use it.
>
> >
> > Other than that looks good to me.
>
> Thanks for the review!
>
> >
> > > + anon_vma_name_get(anon_name);
> > > + return anon_name;
> > > +
> > > + }
> > > + return anon_vma_name_alloc(anon_name->name);
> > > +}
> > > +
> > > static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
> > > struct vm_area_struct *new_vma)
> > > {
> > > struct anon_vma_name *anon_name = vma_anon_name(orig_vma);
> > >
> > > - if (anon_name) {
> > > - anon_vma_name_get(anon_name);
> > > - new_vma->anon_name = anon_name;
> > > - }
> > > + if (anon_name)
> > > + new_vma->anon_name = anon_vma_name_reuse(anon_name);
> > > }
> > >
> > > static inline void free_vma_anon_name(struct vm_area_struct *vma)
> > > diff --git a/mm/madvise.c b/mm/madvise.c
> > > index f81d62d8ce9b..a395884aeecb 100644
> > > --- a/mm/madvise.c
> > > +++ b/mm/madvise.c
> > > @@ -122,8 +122,7 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
> > > if (anon_vma_name_eq(orig_name, anon_name))
> > > return 0;
> > >
> > > - anon_vma_name_get(anon_name);
> > > - vma->anon_name = anon_name;
> > > + vma->anon_name = anon_vma_name_reuse(anon_name);
> > > anon_vma_name_put(orig_name);
> > >
> > > return 0;
> > > --
> > > 2.35.1.473.g83b2b277ed-goog
> >
> > --
> > Michal Hocko
> > SUSE Labs

2022-02-23 11:42:19

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH 2/3] mm: prevent vm_area_struct::anon_name refcount saturation

On Tue 22-02-22 19:02:08, Suren Baghdasaryan wrote:
> On Tue, Feb 22, 2022 at 7:56 AM Suren Baghdasaryan <[email protected]> wrote:
> >
> > On Tue, Feb 22, 2022 at 1:17 AM Michal Hocko <[email protected]> wrote:
> > >
> > > On Mon 21-02-22 21:40:24, Suren Baghdasaryan wrote:
> > > > A deep process chain with many vmas could grow really high.
> > >
> > > This would really benefit from some numbers. With default
> > > sysctl_max_map_count (64k) and default pid_max (32k) the INT_MAX could
> > > be theoretically reached but I find it impractical because not all vmas
> > > can be anonymous same as all available pids can be consumed for a
> > > theoretical attack (if my counting is proper).
> > > On the other hand any non-default configuration with any of the values
> > > increased could hit this theoretically.
> >
> > re: This would really benefit from some numbers
> > Should I just add the details you provided above into the description?
> > Would that suffice?
>
> Hmm. According to the defaults you posted, with max number of
> processes being 32k and max number of vmas per process 64k, the max
> number of vmas in the system is 2147450880. That's 32767 less than
> REFCOUNT_MAX=INT_MAX (2147483647) and 1073774592 less than
> REFCOUNT_SATURATED (3221225472). So with those defaults we should
> never hit these limits. Are we adding this protection for systems that
> set non-default higher limits or am I miscalculating something?

Yeah, I guess this should be the message the changelog should be
sending.
--
Michal Hocko
SUSE Labs

2022-02-23 14:59:55

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH v4 3/3] mm: fix use-after-free when anon vma name is used after vma is freed

On Tue 22-02-22 07:43:40, Suren Baghdasaryan wrote:
> On Tue, Feb 22, 2022 at 12:06 AM Michal Hocko <[email protected]> wrote:
> >
> > On Mon 21-02-22 21:40:25, Suren Baghdasaryan wrote:
> > > When adjacent vmas are being merged it can result in the vma that was
> > > originally passed to madvise_update_vma being destroyed. In the current
> > > implementation, the name parameter passed to madvise_update_vma points
> > > directly to vma->anon_name->name and it is used after the call to
> > > vma_merge. In the cases when vma_merge merges the original vma and
> > > destroys it, this will result in use-after-free bug as shown below:
> > >
> > > madvise_vma_behavior << passes vma->anon_name->name as name param
> > > madvise_update_vma(name)
> > > vma_merge
> > > __vma_adjust
> > > vm_area_free <-- frees the vma
> > > replace_vma_anon_name(name) <-- UAF
> >
> > This seems to be stale because bare const char pointer is not passed in
> > the call chain. In fact I am not even sure there is any actual UAF here
> > after the rework.
> > Could you be more specific in describing the scenario?
>
> Yes, sorry, I need to update the part of the description talking about
> passing vma->anon_name->name directly.
> I think UAF is still there, it's just harder to reproduce (admittedly
> I could not reproduce it with the previous reproducer). The scenario
> would be when a vma with vma->anon_name->kref == 1 is being merged
> with another one and freed in the process:
>
> madvise_vma_behavior
> anon_name = vma_anon_name(vma) <-- does not increase refcount
> madvise_update_vma(anon_name)
> *prev = vma_merge <-- returns another vma
> __vma_adjust
> vm_area_free(vma)
> free_vma_anon_name
> anon_vma_name_put
> vma_anon_name_free <-- frees the vma->anon_name
> vma = *prev <-- original vma was freed

How come this is not a UAF in the first place?

> replace_vma_anon_name(vma, >>anon_name<<) <-- UAF
>
> Does this make sense or did I miss something?

Sorry for being dense but I still do not see it. If *prev has been freed
then we already have a different UAF. Admittedly, I am not really fluent
at vma_merge code path so I am not really sure your chain above is
really possible. I will try to double check later.
--
Michal Hocko
SUSE Labs

2022-02-23 16:14:36

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH v4 3/3] mm: fix use-after-free when anon vma name is used after vma is freed

On Wed, Feb 23, 2022 at 12:55 AM Michal Hocko <[email protected]> wrote:
>
> On Tue 22-02-22 07:43:40, Suren Baghdasaryan wrote:
> > On Tue, Feb 22, 2022 at 12:06 AM Michal Hocko <[email protected]> wrote:
> > >
> > > On Mon 21-02-22 21:40:25, Suren Baghdasaryan wrote:
> > > > When adjacent vmas are being merged it can result in the vma that was
> > > > originally passed to madvise_update_vma being destroyed. In the current
> > > > implementation, the name parameter passed to madvise_update_vma points
> > > > directly to vma->anon_name->name and it is used after the call to
> > > > vma_merge. In the cases when vma_merge merges the original vma and
> > > > destroys it, this will result in use-after-free bug as shown below:
> > > >
> > > > madvise_vma_behavior << passes vma->anon_name->name as name param
> > > > madvise_update_vma(name)
> > > > vma_merge
> > > > __vma_adjust
> > > > vm_area_free <-- frees the vma
> > > > replace_vma_anon_name(name) <-- UAF
> > >
> > > This seems to be stale because bare const char pointer is not passed in
> > > the call chain. In fact I am not even sure there is any actual UAF here
> > > after the rework.
> > > Could you be more specific in describing the scenario?
> >
> > Yes, sorry, I need to update the part of the description talking about
> > passing vma->anon_name->name directly.
> > I think UAF is still there, it's just harder to reproduce (admittedly
> > I could not reproduce it with the previous reproducer). The scenario
> > would be when a vma with vma->anon_name->kref == 1 is being merged
> > with another one and freed in the process:
> >
> > madvise_vma_behavior
> > anon_name = vma_anon_name(vma) <-- does not increase refcount
> > madvise_update_vma(anon_name)
> > *prev = vma_merge <-- returns another vma
> > __vma_adjust
> > vm_area_free(vma)
> > free_vma_anon_name
> > anon_vma_name_put
> > vma_anon_name_free <-- frees the vma->anon_name
> > vma = *prev <-- original vma was freed
>
> How come this is not a UAF in the first place?

Sorry, I got you confused. The original vma that was passed as a
parameter to vma_merge(vma) was freed and vma_merge() returns the area
it was merged with:

*prev = vma_merge(vma_a) <-- vma_a is merged with adjacent vma_b,
vma_a is freed and vma_b is returned
vma = *prev <-- "vma" now points to vma_b

>
> > replace_vma_anon_name(vma, >>anon_name<<) <-- UAF
> >
> > Does this make sense or did I miss something?
>
> Sorry for being dense but I still do not see it. If *prev has been freed
> then we already have a different UAF. Admittedly, I am not really fluent
> at vma_merge code path so I am not really sure your chain above is
> really possible. I will try to double check later.
> --
> Michal Hocko
> SUSE Labs

2022-02-24 00:54:27

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH v4 3/3] mm: fix use-after-free when anon vma name is used after vma is freed

On Wed 23-02-22 06:10:54, Suren Baghdasaryan wrote:
> On Wed, Feb 23, 2022 at 12:55 AM Michal Hocko <[email protected]> wrote:
> >
> > On Tue 22-02-22 07:43:40, Suren Baghdasaryan wrote:
> > > On Tue, Feb 22, 2022 at 12:06 AM Michal Hocko <[email protected]> wrote:
> > > >
> > > > On Mon 21-02-22 21:40:25, Suren Baghdasaryan wrote:
> > > > > When adjacent vmas are being merged it can result in the vma that was
> > > > > originally passed to madvise_update_vma being destroyed. In the current
> > > > > implementation, the name parameter passed to madvise_update_vma points
> > > > > directly to vma->anon_name->name and it is used after the call to
> > > > > vma_merge. In the cases when vma_merge merges the original vma and
> > > > > destroys it, this will result in use-after-free bug as shown below:
> > > > >
> > > > > madvise_vma_behavior << passes vma->anon_name->name as name param
> > > > > madvise_update_vma(name)
> > > > > vma_merge
> > > > > __vma_adjust
> > > > > vm_area_free <-- frees the vma
> > > > > replace_vma_anon_name(name) <-- UAF
> > > >
> > > > This seems to be stale because bare const char pointer is not passed in
> > > > the call chain. In fact I am not even sure there is any actual UAF here
> > > > after the rework.
> > > > Could you be more specific in describing the scenario?
> > >
> > > Yes, sorry, I need to update the part of the description talking about
> > > passing vma->anon_name->name directly.
> > > I think UAF is still there, it's just harder to reproduce (admittedly
> > > I could not reproduce it with the previous reproducer). The scenario
> > > would be when a vma with vma->anon_name->kref == 1 is being merged
> > > with another one and freed in the process:
> > >
> > > madvise_vma_behavior
> > > anon_name = vma_anon_name(vma) <-- does not increase refcount
> > > madvise_update_vma(anon_name)
> > > *prev = vma_merge <-- returns another vma
> > > __vma_adjust
> > > vm_area_free(vma)
> > > free_vma_anon_name
> > > anon_vma_name_put
> > > vma_anon_name_free <-- frees the vma->anon_name
> > > vma = *prev <-- original vma was freed
> >
> > How come this is not a UAF in the first place?
>
> Sorry, I got you confused. The original vma that was passed as a
> parameter to vma_merge(vma) was freed and vma_merge() returns the area
> it was merged with:
>
> *prev = vma_merge(vma_a) <-- vma_a is merged with adjacent vma_b,
> vma_a is freed and vma_b is returned
> vma = *prev <-- "vma" now points to vma_b

OK, I see the problem now. Essentially if we have two adjacent vmas
which have the same name but wrapped by two anon_vma_name instances
and we merge them together then the first one can be freed along with
its anon_vma_name which is still used by replace_anon_vma_name.

Thanks for the patience.
--
Michal Hocko
SUSE Labs

2022-02-24 00:57:52

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH v4 3/3] mm: fix use-after-free when anon vma name is used after vma is freed

On Wed, Feb 23, 2022 at 7:29 AM Michal Hocko <[email protected]> wrote:
>
> On Wed 23-02-22 06:10:54, Suren Baghdasaryan wrote:
> > On Wed, Feb 23, 2022 at 12:55 AM Michal Hocko <[email protected]> wrote:
> > >
> > > On Tue 22-02-22 07:43:40, Suren Baghdasaryan wrote:
> > > > On Tue, Feb 22, 2022 at 12:06 AM Michal Hocko <[email protected]> wrote:
> > > > >
> > > > > On Mon 21-02-22 21:40:25, Suren Baghdasaryan wrote:
> > > > > > When adjacent vmas are being merged it can result in the vma that was
> > > > > > originally passed to madvise_update_vma being destroyed. In the current
> > > > > > implementation, the name parameter passed to madvise_update_vma points
> > > > > > directly to vma->anon_name->name and it is used after the call to
> > > > > > vma_merge. In the cases when vma_merge merges the original vma and
> > > > > > destroys it, this will result in use-after-free bug as shown below:
> > > > > >
> > > > > > madvise_vma_behavior << passes vma->anon_name->name as name param
> > > > > > madvise_update_vma(name)
> > > > > > vma_merge
> > > > > > __vma_adjust
> > > > > > vm_area_free <-- frees the vma
> > > > > > replace_vma_anon_name(name) <-- UAF
> > > > >
> > > > > This seems to be stale because bare const char pointer is not passed in
> > > > > the call chain. In fact I am not even sure there is any actual UAF here
> > > > > after the rework.
> > > > > Could you be more specific in describing the scenario?
> > > >
> > > > Yes, sorry, I need to update the part of the description talking about
> > > > passing vma->anon_name->name directly.
> > > > I think UAF is still there, it's just harder to reproduce (admittedly
> > > > I could not reproduce it with the previous reproducer). The scenario
> > > > would be when a vma with vma->anon_name->kref == 1 is being merged
> > > > with another one and freed in the process:
> > > >
> > > > madvise_vma_behavior
> > > > anon_name = vma_anon_name(vma) <-- does not increase refcount
> > > > madvise_update_vma(anon_name)
> > > > *prev = vma_merge <-- returns another vma
> > > > __vma_adjust
> > > > vm_area_free(vma)
> > > > free_vma_anon_name
> > > > anon_vma_name_put
> > > > vma_anon_name_free <-- frees the vma->anon_name
> > > > vma = *prev <-- original vma was freed
> > >
> > > How come this is not a UAF in the first place?
> >
> > Sorry, I got you confused. The original vma that was passed as a
> > parameter to vma_merge(vma) was freed and vma_merge() returns the area
> > it was merged with:
> >
> > *prev = vma_merge(vma_a) <-- vma_a is merged with adjacent vma_b,
> > vma_a is freed and vma_b is returned
> > vma = *prev <-- "vma" now points to vma_b
>
> OK, I see the problem now. Essentially if we have two adjacent vmas
> which have the same name but wrapped by two anon_vma_name instances
> and we merge them together then the first one can be freed along with
> its anon_vma_name which is still used by replace_anon_vma_name.
>
> Thanks for the patience.

Thanks for reviewing! I posted the next version of the patchset at
https://lore.kernel.org/linux-mm/[email protected]/

> --
> Michal Hocko
> SUSE Labs

2022-02-24 01:14:19

by Suren Baghdasaryan

[permalink] [raw]
Subject: Re: [PATCH 2/3] mm: prevent vm_area_struct::anon_name refcount saturation

On Tue, Feb 22, 2022 at 1:17 AM Michal Hocko <[email protected]> wrote:
>
> On Mon 21-02-22 21:40:24, Suren Baghdasaryan wrote:
> > A deep process chain with many vmas could grow really high.
>
> This would really benefit from some numbers. With default
> sysctl_max_map_count (64k) and default pid_max (32k) the INT_MAX could
> be theoretically reached but I find it impractical because not all vmas
> can be anonymous same as all available pids can be consumed for a
> theoretical attack (if my counting is proper).
> On the other hand any non-default configuration with any of the values
> increased could hit this theoretically.

re: This would really benefit from some numbers
Should I just add the details you provided above into the description?
Would that suffice?

>
> > kref
> > refcounting interface used in anon_vma_name structure will detect
> > a counter overflow when it reaches REFCOUNT_SATURATED value but will
> > only generate a warning about broken refcounting.
> > To ensure anon_vma_name refcount does not overflow, stop anon_vma_name
> > sharing when the refcount reaches INT_MAX, which still leaves INT_MAX/2
> > values before the counter reaches REFCOUNT_SATURATED. This should provide
> > enough headroom for raising the refcounts temporarily.
> >
> > Suggested-by: Michal Hocko <[email protected]>
> > Signed-off-by: Suren Baghdasaryan <[email protected]>
> > ---
> > include/linux/mm_inline.h | 18 ++++++++++++++----
> > mm/madvise.c | 3 +--
> > 2 files changed, 15 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
> > index 70b619442d56..b189e2638843 100644
> > --- a/include/linux/mm_inline.h
> > +++ b/include/linux/mm_inline.h
> > @@ -156,15 +156,25 @@ static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
> >
> > extern void anon_vma_name_put(struct anon_vma_name *anon_name);
> >
> > +static inline
> > +struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
> > +{
> > + /* Prevent anon_name refcount saturation early on */
> > + if (kref_read(&anon_name->kref) < INT_MAX) {
>
> REFCOUNT_MAX seems to be defined by the kref framework.

Ah, indeed. I missed that. Will change to use it.

>
> Other than that looks good to me.

Thanks for the review!

>
> > + anon_vma_name_get(anon_name);
> > + return anon_name;
> > +
> > + }
> > + return anon_vma_name_alloc(anon_name->name);
> > +}
> > +
> > static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
> > struct vm_area_struct *new_vma)
> > {
> > struct anon_vma_name *anon_name = vma_anon_name(orig_vma);
> >
> > - if (anon_name) {
> > - anon_vma_name_get(anon_name);
> > - new_vma->anon_name = anon_name;
> > - }
> > + if (anon_name)
> > + new_vma->anon_name = anon_vma_name_reuse(anon_name);
> > }
> >
> > static inline void free_vma_anon_name(struct vm_area_struct *vma)
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index f81d62d8ce9b..a395884aeecb 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -122,8 +122,7 @@ static int replace_vma_anon_name(struct vm_area_struct *vma,
> > if (anon_vma_name_eq(orig_name, anon_name))
> > return 0;
> >
> > - anon_vma_name_get(anon_name);
> > - vma->anon_name = anon_name;
> > + vma->anon_name = anon_vma_name_reuse(anon_name);
> > anon_vma_name_put(orig_name);
> >
> > return 0;
> > --
> > 2.35.1.473.g83b2b277ed-goog
>
> --
> Michal Hocko
> SUSE Labs