From: Colin Cross <[email protected]>
Refactor the madvise syscall to allow for parts of it to be reused by a
prctl syscall that affects vmas.
Move the code that walks vmas in a virtual address range into a function
that takes a function pointer as a parameter. The only caller for now is
sys_madvise, which uses it to call madvise_vma_behavior on each vma, but
the next patch will add an additional caller.
Move handling all vma behaviors inside madvise_behavior, and rename it to
madvise_vma_behavior.
Move the code that updates the flags on a vma, including splitting or
merging the vma as necessary, into a new function called
madvise_update_vma. The next patch will add support for updating a new
anon_name field as well.
Signed-off-by: Colin Cross <[email protected]>
Cc: Pekka Enberg <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Cc: Jan Glauber <[email protected]>
Cc: John Stultz <[email protected]>
Cc: Rob Landley <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: "Serge E. Hallyn" <[email protected]>
Cc: David Rientjes <[email protected]>
Cc: Al Viro <[email protected]>
Cc: Hugh Dickins <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Shaohua Li <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Minchan Kim <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
[sumits: rebased over v5.9-rc3]
Signed-off-by: Sumit Semwal <[email protected]>
[surenb: rebased over v5.14-rc7]
Signed-off-by: Suren Baghdasaryan <[email protected]>
---
previous version at:
https://lore.kernel.org/linux-mm/[email protected]/
changes in v10
- Cleanup error handling code to return the error immediately instead of
assigning it to an intermediate variable first.
mm/madvise.c | 338 +++++++++++++++++++++++++++------------------------
1 file changed, 178 insertions(+), 160 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 0734db8d53a7..d057109d7d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -63,76 +63,20 @@ static int madvise_need_mmap_write(int behavior)
}
/*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
+ * Update the vm_flags on regiion of a vma, splitting it or merging it as
+ * necessary. Must be called with mmap_sem held for writing;
*/
-static long madvise_behavior(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end, int behavior)
+static int madvise_update_vma(struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, unsigned long new_flags)
{
struct mm_struct *mm = vma->vm_mm;
- int error = 0;
+ int error;
pgoff_t pgoff;
- unsigned long new_flags = vma->vm_flags;
-
- switch (behavior) {
- case MADV_NORMAL:
- new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
- break;
- case MADV_SEQUENTIAL:
- new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
- break;
- case MADV_RANDOM:
- new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
- break;
- case MADV_DONTFORK:
- new_flags |= VM_DONTCOPY;
- break;
- case MADV_DOFORK:
- if (vma->vm_flags & VM_IO) {
- error = -EINVAL;
- goto out;
- }
- new_flags &= ~VM_DONTCOPY;
- break;
- case MADV_WIPEONFORK:
- /* MADV_WIPEONFORK is only supported on anonymous memory. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED) {
- error = -EINVAL;
- goto out;
- }
- new_flags |= VM_WIPEONFORK;
- break;
- case MADV_KEEPONFORK:
- new_flags &= ~VM_WIPEONFORK;
- break;
- case MADV_DONTDUMP:
- new_flags |= VM_DONTDUMP;
- break;
- case MADV_DODUMP:
- if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
- error = -EINVAL;
- goto out;
- }
- new_flags &= ~VM_DONTDUMP;
- break;
- case MADV_MERGEABLE:
- case MADV_UNMERGEABLE:
- error = ksm_madvise(vma, start, end, behavior, &new_flags);
- if (error)
- goto out_convert_errno;
- break;
- case MADV_HUGEPAGE:
- case MADV_NOHUGEPAGE:
- error = hugepage_madvise(vma, &new_flags, behavior);
- if (error)
- goto out_convert_errno;
- break;
- }
if (new_flags == vma->vm_flags) {
*prev = vma;
- goto out;
+ return 0;
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -147,23 +91,19 @@ static long madvise_behavior(struct vm_area_struct *vma,
*prev = vma;
if (start != vma->vm_start) {
- if (unlikely(mm->map_count >= sysctl_max_map_count)) {
- error = -ENOMEM;
- goto out;
- }
+ if (unlikely(mm->map_count >= sysctl_max_map_count))
+ return -ENOMEM;
error = __split_vma(mm, vma, start, 1);
if (error)
- goto out_convert_errno;
+ return error;
}
if (end != vma->vm_end) {
- if (unlikely(mm->map_count >= sysctl_max_map_count)) {
- error = -ENOMEM;
- goto out;
- }
+ if (unlikely(mm->map_count >= sysctl_max_map_count))
+ return -ENOMEM;
error = __split_vma(mm, vma, end, 0);
if (error)
- goto out_convert_errno;
+ return error;
}
success:
@@ -172,15 +112,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
*/
vma->vm_flags = new_flags;
-out_convert_errno:
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
-out:
- return error;
+ return 0;
}
#ifdef CONFIG_SWAP
@@ -930,6 +862,94 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
+/*
+ * Apply an madvise behavior to a region of a vma. madvise_update_vma
+ * will handle splitting a vm area into separate areas, each area with its own
+ * behavior.
+ */
+static int madvise_vma_behavior(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ unsigned long behavior)
+{
+ int error;
+ unsigned long new_flags = vma->vm_flags;
+
+ switch (behavior) {
+ case MADV_REMOVE:
+ return madvise_remove(vma, prev, start, end);
+ case MADV_WILLNEED:
+ return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
+ case MADV_PAGEOUT:
+ return madvise_pageout(vma, prev, start, end);
+ case MADV_FREE:
+ case MADV_DONTNEED:
+ return madvise_dontneed_free(vma, prev, start, end, behavior);
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return madvise_populate(vma, prev, start, end, behavior);
+ case MADV_NORMAL:
+ new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+ break;
+ case MADV_SEQUENTIAL:
+ new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+ break;
+ case MADV_DONTFORK:
+ new_flags |= VM_DONTCOPY;
+ break;
+ case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO)
+ return -EINVAL;
+ new_flags &= ~VM_DONTCOPY;
+ break;
+ case MADV_WIPEONFORK:
+ /* MADV_WIPEONFORK is only supported on anonymous memory. */
+ if (vma->vm_file || vma->vm_flags & VM_SHARED)
+ return -EINVAL;
+ new_flags |= VM_WIPEONFORK;
+ break;
+ case MADV_KEEPONFORK:
+ new_flags &= ~VM_WIPEONFORK;
+ break;
+ case MADV_DONTDUMP:
+ new_flags |= VM_DONTDUMP;
+ break;
+ case MADV_DODUMP:
+ if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
+ return -EINVAL;
+ new_flags &= ~VM_DONTDUMP;
+ break;
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+ error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ if (error)
+ goto out;
+ break;
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ error = hugepage_madvise(vma, &new_flags, behavior);
+ if (error)
+ goto out;
+ break;
+ }
+
+ error = madvise_update_vma(vma, prev, start, end, new_flags);
+
+out:
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
#ifdef CONFIG_MEMORY_FAILURE
/*
* Error injection support for memory error handling.
@@ -978,30 +998,6 @@ static int madvise_inject_error(int behavior,
}
#endif
-static long
-madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end, int behavior)
-{
- switch (behavior) {
- case MADV_REMOVE:
- return madvise_remove(vma, prev, start, end);
- case MADV_WILLNEED:
- return madvise_willneed(vma, prev, start, end);
- case MADV_COLD:
- return madvise_cold(vma, prev, start, end);
- case MADV_PAGEOUT:
- return madvise_pageout(vma, prev, start, end);
- case MADV_FREE:
- case MADV_DONTNEED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- return madvise_populate(vma, prev, start, end, behavior);
- default:
- return madvise_behavior(vma, prev, start, end, behavior);
- }
-}
-
static bool
madvise_behavior_valid(int behavior)
{
@@ -1055,6 +1051,73 @@ process_madvise_behavior_valid(int behavior)
}
}
+/*
+ * Walk the vmas in range [start,end), and call the visit function on each one.
+ * The visit function will get start and end parameters that cover the overlap
+ * between the current vma and the original range. Any unmapped regions in the
+ * original range will result in this function returning -ENOMEM while still
+ * calling the visit function on all of the existing vmas in the range.
+ * Must be called with the mmap_lock held for reading or writing.
+ */
+static
+int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long arg,
+ int (*visit)(struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, unsigned long arg))
+{
+ struct vm_area_struct *vma;
+ struct vm_area_struct *prev;
+ unsigned long tmp;
+ int unmapped_error = 0;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - different from the way of handling in mlock etc.
+ */
+ vma = find_vma_prev(mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ int error;
+
+ /* Still start < end. */
+ if (!vma)
+ return -ENOMEM;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ break;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = visit(vma, &prev, start, tmp, arg);
+ if (error)
+ return error;
+ start = tmp;
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ if (start >= end)
+ break;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_lock */
+ vma = find_vma(mm, start);
+ }
+
+ return unmapped_error;
+}
+
/*
* The madvise(2) system call.
*
@@ -1127,10 +1190,8 @@ process_madvise_behavior_valid(int behavior)
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
- unsigned long end, tmp;
- struct vm_area_struct *vma, *prev;
- int unmapped_error = 0;
- int error = -EINVAL;
+ unsigned long end;
+ int error;
int write;
size_t len;
struct blk_plug plug;
@@ -1138,23 +1199,22 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
start = untagged_addr(start);
if (!madvise_behavior_valid(behavior))
- return error;
+ return -EINVAL;
if (!PAGE_ALIGNED(start))
- return error;
+ return -EINVAL;
len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- return error;
+ return -EINVAL;
end = start + len;
if (end < start)
- return error;
+ return -EINVAL;
- error = 0;
if (end == start)
- return error;
+ return 0;
#ifdef CONFIG_MEMORY_FAILURE
if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -1169,51 +1229,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
mmap_read_lock(mm);
}
- /*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- * - different from the way of handling in mlock etc.
- */
- vma = find_vma_prev(mm, start, &prev);
- if (vma && start > vma->vm_start)
- prev = vma;
-
blk_start_plug(&plug);
- for (;;) {
- /* Still start < end. */
- error = -ENOMEM;
- if (!vma)
- goto out;
-
- /* Here start < (end|vma->vm_end). */
- if (start < vma->vm_start) {
- unmapped_error = -ENOMEM;
- start = vma->vm_start;
- if (start >= end)
- goto out;
- }
-
- /* Here vma->vm_start <= start < (end|vma->vm_end) */
- tmp = vma->vm_end;
- if (end < tmp)
- tmp = end;
-
- /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
- error = madvise_vma(vma, &prev, start, tmp, behavior);
- if (error)
- goto out;
- start = tmp;
- if (prev && start < prev->vm_end)
- start = prev->vm_end;
- error = unmapped_error;
- if (start >= end)
- goto out;
- if (prev)
- vma = prev->vm_next;
- else /* madvise_remove dropped mmap_lock */
- vma = find_vma(mm, start);
- }
-out:
+ error = madvise_walk_vmas(mm, start, end, behavior,
+ madvise_vma_behavior);
blk_finish_plug(&plug);
if (write)
mmap_write_unlock(mm);
--
2.33.0.800.g4c38ced690-goog
From: Colin Cross <[email protected]>
In many userspace applications, and especially in VM based applications
like Android uses heavily, there are multiple different allocators in use.
At a minimum there is libc malloc and the stack, and in many cases there
are libc malloc, the stack, direct syscalls to mmap anonymous memory, and
multiple VM heaps (one for small objects, one for big objects, etc.).
Each of these layers usually has its own tools to inspect its usage;
malloc by compiling a debug version, the VM through heap inspection tools,
and for direct syscalls there is usually no way to track them.
On Android we heavily use a set of tools that use an extended version of
the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped
in userspace and slice their usage by process, shared (COW) vs. unique
mappings, backing, etc. This can account for real physical memory usage
even in cases like fork without exec (which Android uses heavily to share
as many private COW pages as possible between processes), Kernel SamePage
Merging, and clean zero pages. It produces a measurement of the pages
that only exist in that process (USS, for unique), and a measurement of
the physical memory usage of that process with the cost of shared pages
being evenly split between processes that share them (PSS).
If all anonymous memory is indistinguishable then figuring out the real
physical memory usage (PSS) of each heap requires either a pagemap walking
tool that can understand the heap debugging of every layer, or for every
layer's heap debugging tools to implement the pagemap walking logic, in
which case it is hard to get a consistent view of memory across the whole
system.
Tracking the information in userspace leads to all sorts of problems.
It either needs to be stored inside the process, which means every
process has to have an API to export its current heap information upon
request, or it has to be stored externally in a filesystem that
somebody needs to clean up on crashes. It needs to be readable while
the process is still running, so it has to have some sort of
synchronization with every layer of userspace. Efficiently tracking
the ranges requires reimplementing something like the kernel vma
trees, and linking to it from every layer of userspace. It requires
more memory, more syscalls, more runtime cost, and more complexity to
separately track regions that the kernel is already tracking.
This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a
userspace-provided name for anonymous vmas. The names of named anonymous
vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:<name>].
Userspace can set the name for a region of memory by calling
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
Setting the name to NULL clears it. The name length limit is 80 bytes
including NUL-terminator and is checked to contain only printable ascii
characters (including space), except '[',']','\','$' and '`'.
The name is stored in a pointer in the shared union in vm_area_struct
that points to a null terminated string. Anonymous vmas with the same
name (equivalent strings) and are otherwise mergeable will be merged.
The name pointers are not shared between vmas even if they contain the
same name. The name pointer is stored in a union with fields that are
only used on file-backed mappings, so it does not increase memory usage.
The patch is based on the original patch developed by Colin Cross, more
specifically on its latest version [1] posted upstream by Sumit Semwal.
It used a userspace pointer to store vma names. In that design, name
pointers could be shared between vmas. However during the last upstreaming
attempt, Kees Cook raised concerns [2] about this approach and suggested
to copy the name into kernel memory space, perform validity checks [3]
and store as a string referenced from vm_area_struct.
One big concern is about fork() performance which would need to strdup
anonymous vma names. Dave Hansen suggested experimenting with worst-case
scenario of forking a process with 64k vmas having longest possible names
[4]. I ran this experiment on an ARM64 Android device and recorded a
worst-case regression of almost 40% when forking such a process. This
regression is addressed in the followup patch which replaces the pointer
to a name with a refcounted structure that allows sharing the name pointer
between vmas of the same name. Instead of duplicating the string during
fork() or when splitting a vma it increments the refcount.
[1] https://lore.kernel.org/linux-mm/[email protected]/
[2] https://lore.kernel.org/linux-mm/202009031031.D32EF57ED@keescook/
[3] https://lore.kernel.org/linux-mm/202009031022.3834F692@keescook/
[4] https://lore.kernel.org/linux-mm/[email protected]/
Changes for prctl(2) manual page (in the options section):
PR_SET_VMA
Sets an attribute specified in arg2 for virtual memory areas
starting from the address specified in arg3 and spanning the
size specified in arg4. arg5 specifies the value of the attribute
to be set. Note that assigning an attribute to a virtual memory
area might prevent it from being merged with adjacent virtual
memory areas due to the difference in that attribute's value.
Currently, arg2 must be one of:
PR_SET_VMA_ANON_NAME
Set a name for anonymous virtual memory areas. arg5 should
be a pointer to a null-terminated string containing the
name. The name length including null byte cannot exceed
80 bytes. If arg5 is NULL, the name of the appropriate
anonymous virtual memory areas will be reset. The name
can contain only printable ascii characters (including
space), except '[',']','\','$' and '`'.
Signed-off-by: Colin Cross <[email protected]>
[surenb: rebased over v5.14-rc7, replaced userpointer with a kernel copy
and added input sanitization. The bulk of the work here was done by Colin
Cross, therefore, with his permission, keeping him as the author]
Signed-off-by: Suren Baghdasaryan <[email protected]>
Reviewed-by: Kees Cook <[email protected]>
---
previous version at:
https://lore.kernel.org/linux-mm/[email protected]/
changes in v10
- Removed WARN_ON call in replace_vma_anon_name, per Michal Hocko
- Changed the error code returned by madvise_vma_anon_name() when the mapping
is not anonymous, per Kees Cook
- Added comment in madvise_vma_anon_name() explaining why ENOMEM is replaced
with EAGAIN, per Kees Cook
- Removed extra space in can_vma_merge_after(), per Kees Cook.
- Changed is_valid_name_char() to disallow ` and $ chars, per Kees Cook
- Refactored prctl_set_vma() to avoid a goto, per Kees Cook
- Changed the name limit to 80, per Rasmus Villemoes
- Changed is_same_vma_anon_name() to be more clear about the check,
per Rasmus Villemoes
- Added #ifdef CONFIG_PROC_FS for madvise_vma_anon_name() and
madvise_set_anon_name() to fix builds with CONFIG_PROC_FS=n config
Documentation/filesystems/proc.rst | 2 +
fs/proc/task_mmu.c | 12 ++-
fs/userfaultfd.c | 7 +-
include/linux/mm.h | 13 ++-
include/linux/mm_types.h | 49 +++++++++++-
include/uapi/linux/prctl.h | 3 +
kernel/fork.c | 2 +
kernel/sys.c | 62 +++++++++++++++
mm/madvise.c | 122 +++++++++++++++++++++++++++--
mm/mempolicy.c | 3 +-
mm/mlock.c | 2 +-
mm/mmap.c | 38 +++++----
mm/mprotect.c | 2 +-
13 files changed, 284 insertions(+), 33 deletions(-)
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 042c418f4090..a067eec54ef1 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -431,6 +431,8 @@ is not associated with a file:
[stack] the stack of the main process
[vdso] the "virtual dynamic shared object",
the kernel system call handler
+[anon:<name>] an anonymous mapping that has been
+ named by userspace
======= ====================================
or if empty, the mapping is anonymous.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cf25be3e0321..809e794dc0c5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -308,6 +308,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma);
if (!name) {
+ const char *anon_name;
+
if (!mm) {
name = "[vdso]";
goto done;
@@ -319,8 +321,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- if (is_stack(vma))
+ if (is_stack(vma)) {
name = "[stack]";
+ goto done;
+ }
+
+ anon_name = vma_anon_name(vma);
+ if (anon_name) {
+ seq_pad(m, ' ');
+ seq_printf(m, "[anon:%s]", anon_name);
+ }
}
done:
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 003f0d31743e..941db1b80b15 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -877,7 +877,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, vma_anon_name(vma));
if (prev)
vma = prev;
else
@@ -1436,7 +1436,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- ((struct vm_userfaultfd_ctx){ ctx }));
+ ((struct vm_userfaultfd_ctx){ ctx }),
+ vma_anon_name(vma));
if (prev) {
vma = prev;
goto next;
@@ -1613,7 +1614,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, vma_anon_name(vma));
if (prev) {
vma = prev;
goto next;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 73a52aba448f..07f54c4d4940 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2548,7 +2548,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *, struct vm_userfaultfd_ctx);
+ struct mempolicy *, struct vm_userfaultfd_ctx, const char *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
@@ -3284,5 +3284,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
return 0;
}
+#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_PROC_FS)
+int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, const char *name);
+#else
+static inline int
+madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, const char *name) {
+ return 0;
+}
+#endif
+
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09c711f..f1896c9b2da9 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -350,11 +350,19 @@ struct vm_area_struct {
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
+ *
+ * For private anonymous mappings, a pointer to a null terminated string
+ * containing the name given to the vma, or NULL if unnamed.
*/
- struct {
- struct rb_node rb;
- unsigned long rb_subtree_last;
- } shared;
+
+ union {
+ struct {
+ struct rb_node rb;
+ unsigned long rb_subtree_last;
+ } shared;
+ /* Serialized by mmap_sem. */
+ char *anon_name;
+ };
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
@@ -809,4 +817,37 @@ typedef struct {
unsigned long val;
} swp_entry_t;
+/*
+ * mmap_lock should be read-locked when calling vma_anon_name() and while using
+ * the returned pointer.
+ */
+extern const char *vma_anon_name(struct vm_area_struct *vma);
+
+/*
+ * mmap_lock should be read-locked for orig_vma->vm_mm.
+ * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be
+ * isolated.
+ */
+extern void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+ struct vm_area_struct *new_vma);
+
+/*
+ * mmap_lock should be write-locked or vma should have been isolated under
+ * write-locked mmap_lock protection.
+ */
+extern void free_vma_anon_name(struct vm_area_struct *vma);
+
+/* mmap_lock should be read-locked */
+static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
+ const char *name)
+{
+ const char *vma_name = vma_anon_name(vma);
+
+ /* either both NULL, or pointers to same string */
+ if (vma_name == name)
+ return true;
+
+ return name && vma_name && !strcmp(name, vma_name);
+}
+
#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 43bd7f713c39..4c8cbf510b2d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -269,4 +269,7 @@ struct prctl_mm_map {
# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */
# define PR_SCHED_CORE_MAX 4
+#define PR_SET_VMA 0x53564d41
+# define PR_SET_VMA_ANON_NAME 0
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76..bb7ac41048c2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -366,12 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
+ dup_vma_anon_name(orig, new);
}
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
+ free_vma_anon_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 8fdac0d90504..aea3b3b45356 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2261,6 +2261,65 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
+#ifdef CONFIG_MMU
+
+#define ANON_VMA_NAME_MAX_LEN 80
+#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
+
+static inline bool is_valid_name_char(char ch)
+{
+ /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
+ return ch > 0x1f && ch < 0x7f &&
+ !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long addr,
+ unsigned long size, unsigned long arg)
+{
+ struct mm_struct *mm = current->mm;
+ const char __user *uname;
+ char *name, *pch;
+ int error;
+
+ switch (opt) {
+ case PR_SET_VMA_ANON_NAME:
+ uname = (const char __user *)arg;
+ if (uname) {
+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ for (pch = name; *pch != '\0'; pch++) {
+ if (!is_valid_name_char(*pch)) {
+ kfree(name);
+ return -EINVAL;
+ }
+ }
+ } else {
+ /* Reset the name */
+ name = NULL;
+ }
+
+ mmap_write_lock(mm);
+ error = madvise_set_anon_name(mm, addr, size, name);
+ mmap_write_unlock(mm);
+ kfree(name);
+ break;
+ default:
+ error = -EINVAL;
+ }
+
+ return error;
+}
+#else /* CONFIG_MMU */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long size, unsigned long arg)
+{
+ return -EINVAL;
+}
+#endif
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2530,6 +2589,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
+ case PR_SET_VMA:
+ error = prctl_set_vma(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/mm/madvise.c b/mm/madvise.c
index d057109d7d17..69729930dde1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -18,6 +18,7 @@
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
+#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
@@ -62,19 +63,75 @@ static int madvise_need_mmap_write(int behavior)
}
}
+static inline bool has_vma_anon_name(struct vm_area_struct *vma)
+{
+ return !vma->vm_file && vma->anon_name;
+}
+
+const char *vma_anon_name(struct vm_area_struct *vma)
+{
+ if (!has_vma_anon_name(vma))
+ return NULL;
+
+ mmap_assert_locked(vma->vm_mm);
+
+ return vma->anon_name;
+}
+
+void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+ struct vm_area_struct *new_vma)
+{
+ if (!has_vma_anon_name(orig_vma))
+ return;
+
+ new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL);
+}
+
+void free_vma_anon_name(struct vm_area_struct *vma)
+{
+ if (!has_vma_anon_name(vma))
+ return;
+
+ kfree(vma->anon_name);
+ vma->anon_name = NULL;
+}
+
+/* mmap_lock should be write-locked */
+static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+{
+ if (!name) {
+ free_vma_anon_name(vma);
+ return 0;
+ }
+
+ if (vma->anon_name) {
+ /* Same name, nothing to do here */
+ if (!strcmp(name, vma->anon_name))
+ return 0;
+
+ free_vma_anon_name(vma);
+ }
+ vma->anon_name = kstrdup(name, GFP_KERNEL);
+ if (!vma->anon_name)
+ return -ENOMEM;
+
+ return 0;
+}
+
/*
- * Update the vm_flags on regiion of a vma, splitting it or merging it as
+ * Update the vm_flags on region of a vma, splitting it or merging it as
* necessary. Must be called with mmap_sem held for writing;
*/
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long new_flags)
+ unsigned long end, unsigned long new_flags,
+ const char *name)
{
struct mm_struct *mm = vma->vm_mm;
int error;
pgoff_t pgoff;
- if (new_flags == vma->vm_flags) {
+ if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) {
*prev = vma;
return 0;
}
@@ -82,7 +139,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, name);
if (*prev) {
vma = *prev;
goto success;
@@ -111,6 +168,11 @@ static int madvise_update_vma(struct vm_area_struct *vma,
* vm_flags is protected by the mmap_lock held in write mode.
*/
vma->vm_flags = new_flags;
+ if (!vma->vm_file) {
+ error = replace_vma_anon_name(vma, name);
+ if (error)
+ return error;
+ }
return 0;
}
@@ -938,7 +1000,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
}
- error = madvise_update_vma(vma, prev, start, end, new_flags);
+ error = madvise_update_vma(vma, prev, start, end, new_flags,
+ vma_anon_name(vma));
out:
/*
@@ -1118,6 +1181,55 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
return unmapped_error;
}
+#ifdef CONFIG_PROC_FS
+static int madvise_vma_anon_name(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ unsigned long name)
+{
+ int error;
+
+ /* Only anonymous mappings can be named */
+ if (vma->vm_file)
+ return -EBADF;
+
+ error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
+ (const char *)name);
+
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
+int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, const char *name)
+{
+ unsigned long end;
+ unsigned long len;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ return madvise_walk_vmas(mm, start, end, (unsigned long)name,
+ madvise_vma_anon_name);
+}
+#endif /* CONFIG_PROC_FS */
/*
* The madvise(2) system call.
*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1592b081c58e..187c34a3753e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -810,7 +810,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
- new_pol, vma->vm_userfaultfd_ctx);
+ new_pol, vma->vm_userfaultfd_ctx,
+ vma_anon_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index 16d2ee160d43..c878515680af 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -511,7 +511,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index 88dcc5c25225..4375ee8b1926 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1029,7 +1029,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -1047,6 +1048,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
+ if (!is_same_vma_anon_name(vma, anon_name))
+ return 0;
return 1;
}
@@ -1079,9 +1082,10 @@ static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -1100,9 +1104,10 @@ static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1113,9 +1118,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
}
/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
@@ -1160,7 +1165,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1190,7 +1196,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
- vm_userfaultfd_ctx)) {
+ vm_userfaultfd_ctx, anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
@@ -1199,7 +1205,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
- vm_userfaultfd_ctx) &&
+ vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1222,7 +1228,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
- vm_userfaultfd_ctx)) {
+ vm_userfaultfd_ctx, anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
@@ -1755,7 +1761,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -1804,7 +1810,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (merge) {
/* ->mmap() can change vma->vm_file and fput the original file. So
* fput the vma->vm_file here or we would add an extra fput for file
@@ -3057,7 +3063,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -3250,7 +3256,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_anon_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 883e2cc85cad..a48ff8e79f48 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_anon_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
--
2.33.0.800.g4c38ced690-goog
While forking a process with high number (64K) of named anonymous vmas the
overhead caused by strdup() is noticeable. Experiments with ARM64 Android
device show up to 40% performance regression when forking a process with
64k unpopulated anonymous vmas using the max name lengths vs the same
process with the same number of anonymous vmas having no name.
Introduce anon_vma_name refcounted structure to avoid the overhead of
copying vma names during fork() and when splitting named anonymous vmas.
When a vma is duplicated, instead of copying the name we increment the
refcount of this structure. Multiple vmas can point to the same
anon_vma_name as long as they increment the refcount. The name member of
anon_vma_name structure is assigned at structure allocation time and is
never changed. If vma name changes then the refcount of the original
structure is dropped, a new anon_vma_name structure is allocated
to hold the new name and the vma pointer is updated to point to the new
structure.
With this approach the fork() performance regressions is reduced 3-4x
times and with usecases using more reasonable number of VMAs (a few
thousand) the regressions is not measurable.
Signed-off-by: Suren Baghdasaryan <[email protected]>
Reviewed-by: Kees Cook <[email protected]>
---
previous version at:
https://lore.kernel.org/linux-mm/[email protected]/
changes in v10
- Changed anon_vma_name_alloc() to use struct_size() and avoid strcpy(),
per Kees Cook
include/linux/mm_types.h | 9 ++++++++-
mm/madvise.c | 42 ++++++++++++++++++++++++++++++++++------
2 files changed, 44 insertions(+), 7 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f1896c9b2da9..30c571b4f8d3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -5,6 +5,7 @@
#include <linux/mm_types_task.h>
#include <linux/auxvec.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
@@ -310,6 +311,12 @@ struct vm_userfaultfd_ctx {
struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */
+struct anon_vma_name {
+ struct kref kref;
+ /* The name needs to be at the end because it is dynamically sized. */
+ char name[];
+};
+
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -361,7 +368,7 @@ struct vm_area_struct {
unsigned long rb_subtree_last;
} shared;
/* Serialized by mmap_sem. */
- char *anon_name;
+ struct anon_vma_name *anon_name;
};
/*
diff --git a/mm/madvise.c b/mm/madvise.c
index 69729930dde1..2f1ec13faaaa 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -63,6 +63,29 @@ static int madvise_need_mmap_write(int behavior)
}
}
+static struct anon_vma_name *anon_vma_name_alloc(const char *name)
+{
+ struct anon_vma_name *anon_name;
+ size_t count;
+
+ /* Add 1 for NUL terminator at the end of the anon_name->name */
+ count = strlen(name) + 1;
+ anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
+ if (anon_name) {
+ kref_init(&anon_name->kref);
+ memcpy(anon_name->name, name, count);
+ }
+
+ return anon_name;
+}
+
+static void vma_anon_name_free(struct kref *kref)
+{
+ struct anon_vma_name *anon_name =
+ container_of(kref, struct anon_vma_name, kref);
+ kfree(anon_name);
+}
+
static inline bool has_vma_anon_name(struct vm_area_struct *vma)
{
return !vma->vm_file && vma->anon_name;
@@ -75,7 +98,7 @@ const char *vma_anon_name(struct vm_area_struct *vma)
mmap_assert_locked(vma->vm_mm);
- return vma->anon_name;
+ return vma->anon_name->name;
}
void dup_vma_anon_name(struct vm_area_struct *orig_vma,
@@ -84,34 +107,41 @@ void dup_vma_anon_name(struct vm_area_struct *orig_vma,
if (!has_vma_anon_name(orig_vma))
return;
- new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL);
+ kref_get(&orig_vma->anon_name->kref);
+ new_vma->anon_name = orig_vma->anon_name;
}
void free_vma_anon_name(struct vm_area_struct *vma)
{
+ struct anon_vma_name *anon_name;
+
if (!has_vma_anon_name(vma))
return;
- kfree(vma->anon_name);
+ anon_name = vma->anon_name;
vma->anon_name = NULL;
+ kref_put(&anon_name->kref, vma_anon_name_free);
}
/* mmap_lock should be write-locked */
static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
{
+ const char *anon_name;
+
if (!name) {
free_vma_anon_name(vma);
return 0;
}
- if (vma->anon_name) {
+ anon_name = vma_anon_name(vma);
+ if (anon_name) {
/* Same name, nothing to do here */
- if (!strcmp(name, vma->anon_name))
+ if (!strcmp(name, anon_name))
return 0;
free_vma_anon_name(vma);
}
- vma->anon_name = kstrdup(name, GFP_KERNEL);
+ vma->anon_name = anon_vma_name_alloc(name);
if (!vma->anon_name)
return -ENOMEM;
--
2.33.0.800.g4c38ced690-goog
On Fri, 1 Oct 2021 13:56:56 -0700 Suren Baghdasaryan <[email protected]> wrote:
> From: Colin Cross <[email protected]>
>
> In many userspace applications, and especially in VM based applications
> like Android uses heavily, there are multiple different allocators in use.
> At a minimum there is libc malloc and the stack, and in many cases there
> are libc malloc, the stack, direct syscalls to mmap anonymous memory, and
> multiple VM heaps (one for small objects, one for big objects, etc.).
> Each of these layers usually has its own tools to inspect its usage;
> malloc by compiling a debug version, the VM through heap inspection tools,
> and for direct syscalls there is usually no way to track them.
>
> On Android we heavily use a set of tools that use an extended version of
> the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped
> in userspace and slice their usage by process, shared (COW) vs. unique
> mappings, backing, etc. This can account for real physical memory usage
> even in cases like fork without exec (which Android uses heavily to share
> as many private COW pages as possible between processes), Kernel SamePage
> Merging, and clean zero pages. It produces a measurement of the pages
> that only exist in that process (USS, for unique), and a measurement of
> the physical memory usage of that process with the cost of shared pages
> being evenly split between processes that share them (PSS).
>
> If all anonymous memory is indistinguishable then figuring out the real
> physical memory usage (PSS) of each heap requires either a pagemap walking
> tool that can understand the heap debugging of every layer, or for every
> layer's heap debugging tools to implement the pagemap walking logic, in
> which case it is hard to get a consistent view of memory across the whole
> system.
>
> Tracking the information in userspace leads to all sorts of problems.
> It either needs to be stored inside the process, which means every
> process has to have an API to export its current heap information upon
> request, or it has to be stored externally in a filesystem that
> somebody needs to clean up on crashes. It needs to be readable while
> the process is still running, so it has to have some sort of
> synchronization with every layer of userspace. Efficiently tracking
> the ranges requires reimplementing something like the kernel vma
> trees, and linking to it from every layer of userspace. It requires
> more memory, more syscalls, more runtime cost, and more complexity to
> separately track regions that the kernel is already tracking.
>
> This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a
> userspace-provided name for anonymous vmas. The names of named anonymous
> vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:<name>].
>
> Userspace can set the name for a region of memory by calling
> prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
So this can cause a vma to be split, if [start,len] doesn't exactly
describe an existing vma? If so, is this at all useful? If not then
`len' isn't needed - just pass in some address within an existing vma?
> Setting the name to NULL clears it. The name length limit is 80 bytes
> including NUL-terminator and is checked to contain only printable ascii
> characters (including space), except '[',']','\','$' and '`'.
>
> The name is stored in a pointer in the shared union in vm_area_struct
> that points to a null terminated string. Anonymous vmas with the same
> name (equivalent strings) and are otherwise mergeable will be merged.
So this can prevent vma merging if used incorrectly (or maliciously -
can't think how)? What are the potential impacts of this?
> The name pointers are not shared between vmas even if they contain the
> same name. The name pointer is stored in a union with fields that are
> only used on file-backed mappings, so it does not increase memory usage.
>
> The patch is based on the original patch developed by Colin Cross, more
> specifically on its latest version [1] posted upstream by Sumit Semwal.
> It used a userspace pointer to store vma names. In that design, name
> pointers could be shared between vmas. However during the last upstreaming
> attempt, Kees Cook raised concerns [2] about this approach and suggested
> to copy the name into kernel memory space, perform validity checks [3]
> and store as a string referenced from vm_area_struct.
> One big concern is about fork() performance which would need to strdup
> anonymous vma names. Dave Hansen suggested experimenting with worst-case
> scenario of forking a process with 64k vmas having longest possible names
> [4]. I ran this experiment on an ARM64 Android device and recorded a
> worst-case regression of almost 40% when forking such a process. This
> regression is addressed in the followup patch which replaces the pointer
> to a name with a refcounted structure that allows sharing the name pointer
> between vmas of the same name. Instead of duplicating the string during
> fork() or when splitting a vma it increments the refcount.
Generally, the patch adds a bunch of code which a lot of users won't
want. Did we bust a gut to reduce this impact? Was a standalone
config setting considered?
And what would be the impact of making this feature optional? Is a
proliferation of formats in /proc/pid/maps going to make userspace
parsers harder to develop and test?
I do think that saying "The names of named anonymous vmas are shown in
/proc/pid/maps and /proc/pid/smaps as [anon:<name>]." is a bit thin.
Please provide sample output so we can consider these things better.
What are the risks that existing parsers will be broken by such changes?
On Fri, Oct 1, 2021 at 4:08 PM Andrew Morton <[email protected]> wrote:
>
> On Fri, 1 Oct 2021 13:56:56 -0700 Suren Baghdasaryan <[email protected]> wrote:
>
> > From: Colin Cross <[email protected]>
> >
> > In many userspace applications, and especially in VM based applications
> > like Android uses heavily, there are multiple different allocators in use.
> > At a minimum there is libc malloc and the stack, and in many cases there
> > are libc malloc, the stack, direct syscalls to mmap anonymous memory, and
> > multiple VM heaps (one for small objects, one for big objects, etc.).
> > Each of these layers usually has its own tools to inspect its usage;
> > malloc by compiling a debug version, the VM through heap inspection tools,
> > and for direct syscalls there is usually no way to track them.
> >
> > On Android we heavily use a set of tools that use an extended version of
> > the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped
> > in userspace and slice their usage by process, shared (COW) vs. unique
> > mappings, backing, etc. This can account for real physical memory usage
> > even in cases like fork without exec (which Android uses heavily to share
> > as many private COW pages as possible between processes), Kernel SamePage
> > Merging, and clean zero pages. It produces a measurement of the pages
> > that only exist in that process (USS, for unique), and a measurement of
> > the physical memory usage of that process with the cost of shared pages
> > being evenly split between processes that share them (PSS).
> >
> > If all anonymous memory is indistinguishable then figuring out the real
> > physical memory usage (PSS) of each heap requires either a pagemap walking
> > tool that can understand the heap debugging of every layer, or for every
> > layer's heap debugging tools to implement the pagemap walking logic, in
> > which case it is hard to get a consistent view of memory across the whole
> > system.
> >
> > Tracking the information in userspace leads to all sorts of problems.
> > It either needs to be stored inside the process, which means every
> > process has to have an API to export its current heap information upon
> > request, or it has to be stored externally in a filesystem that
> > somebody needs to clean up on crashes. It needs to be readable while
> > the process is still running, so it has to have some sort of
> > synchronization with every layer of userspace. Efficiently tracking
> > the ranges requires reimplementing something like the kernel vma
> > trees, and linking to it from every layer of userspace. It requires
> > more memory, more syscalls, more runtime cost, and more complexity to
> > separately track regions that the kernel is already tracking.
> >
> > This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a
> > userspace-provided name for anonymous vmas. The names of named anonymous
> > vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:<name>].
> >
> > Userspace can set the name for a region of memory by calling
> > prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
>
> So this can cause a vma to be split, if [start,len] doesn't exactly
> describe an existing vma? If so, is this at all useful? If not then
> `len' isn't needed - just pass in some address within an existing vma?
Technically one could mmap a large chunk of memory and then assign
different names to its parts to use for different purposes, which
would cause the vma to split. I don't think Android uses it that way
but I'll have to double-check. I think one advantage of doing this
could be to minimize the number of mmap syscalls.
> > Setting the name to NULL clears it. The name length limit is 80 bytes
> > including NUL-terminator and is checked to contain only printable ascii
> > characters (including space), except '[',']','\','$' and '`'.
> >
> > The name is stored in a pointer in the shared union in vm_area_struct
> > that points to a null terminated string. Anonymous vmas with the same
> > name (equivalent strings) and are otherwise mergeable will be merged.
>
> So this can prevent vma merging if used incorrectly (or maliciously -
> can't think how)? What are the potential impacts of this?
Potential impact would be that otherwise mergeable vmas would not be
merged due to the name difference. This is a known downside of naming
an anon vma which I documented in my manual pages description as "Note
that assigning an attribute to a virtual memory area might prevent it
from being merged with adjacent virtual memory areas due to the
difference in that attribute's value.". In Android we see an increase
in the number of VMAs due to this feature but it was not significant.
I'll try to dig up the numbers or will rerun the test to get them.
If a process maliciously wants to increase the number of vmas in the
system it could generate lots of vmas with different names in its
address space, however this can be done even without this feature by
mapping vmas while toggling a protection bit. Something like this:
int prot = PROT_WRITE;
for (i = 0; i < count; i++) {
mmap(NULL, size_bytes, prot, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
prot = (prot ^ PROT_READ) & (PROT_READ | PROT_WRITE);
}
> > The name pointers are not shared between vmas even if they contain the
> > same name. The name pointer is stored in a union with fields that are
> > only used on file-backed mappings, so it does not increase memory usage.
> >
> > The patch is based on the original patch developed by Colin Cross, more
> > specifically on its latest version [1] posted upstream by Sumit Semwal.
> > It used a userspace pointer to store vma names. In that design, name
> > pointers could be shared between vmas. However during the last upstreaming
> > attempt, Kees Cook raised concerns [2] about this approach and suggested
> > to copy the name into kernel memory space, perform validity checks [3]
> > and store as a string referenced from vm_area_struct.
> > One big concern is about fork() performance which would need to strdup
> > anonymous vma names. Dave Hansen suggested experimenting with worst-case
> > scenario of forking a process with 64k vmas having longest possible names
> > [4]. I ran this experiment on an ARM64 Android device and recorded a
> > worst-case regression of almost 40% when forking such a process. This
> > regression is addressed in the followup patch which replaces the pointer
> > to a name with a refcounted structure that allows sharing the name pointer
> > between vmas of the same name. Instead of duplicating the string during
> > fork() or when splitting a vma it increments the refcount.
>
> Generally, the patch adds a bunch of code which a lot of users won't
> want. Did we bust a gut to reduce this impact? Was a standalone
> config setting considered?
I didn't consider a standalone config for this feature because when
not used it has no memory impact at runtime. As for the image size, I
built Linus' ToT with and without this patchset with allmodconfig and
the sizes are:
Without the patchset:
$ size vmlinux
text data bss dec hex filename
40763556 58424519 29016228 128204303 7a43e0f vmlinux
With the patchset:
$ size vmlinux
text data bss dec hex filename
40765068 58424671 29016228 128205967 7a4448f vmlinux
The increase seems quite small, so I'm not sure if it warrants a
separate config option.
> And what would be the impact of making this feature optional? Is a
> proliferation of formats in /proc/pid/maps going to make userspace
> parsers harder to develop and test?
I'm guessing having one format is simpler and therefore preferable?
> I do think that saying "The names of named anonymous vmas are shown in
> /proc/pid/maps and /proc/pid/smaps as [anon:<name>]." is a bit thin.
> Please provide sample output so we can consider these things better.
Sure. Here is a sample /proc/$pid/maps output (partial):
6ffacb6000-6ffacd6000 r--s 00000000 00:10 361
/dev/__properties__/properties_serial
6ffacd6000-6ffacd9000 rw-p 00000000 00:00 0
[anon:System property context nodes]
6ffacd9000-6ffaceb000 r--s 00000000 00:10 79
/dev/__properties__/property_info
6ffaceb000-6ffad4f000 r--p 00000000 00:00 0
[anon:linker_alloc]
6ffad4f000-6ffad51000 rw-p 00000000 00:00 0
[anon:bionic_alloc_small_objects]
6ffad51000-6ffad52000 r--p 00000000 00:00 0
[anon:atexit handlers]
6ffad52000-6ffbc2c000 ---p 00000000 00:00 0
6ffbc2c000-6ffbc2e000 rw-p 00000000 00:00 0
6ffbc2e000-6ffbd52000 ---p 00000000 00:00 0
6ffbd52000-6ffbd53000 ---p 00000000 00:00 0
6ffbd53000-6ffbd5b000 rw-p 00000000 00:00 0
[anon:thread signal stack]
6ffbd5b000-6ffbd5c000 rw-p 00000000 00:00 0
[anon:arc4random data]
6ffbd5c000-6ffbd5d000 r--p 0000d000 07:90 59
/apex/com.android.art/javalib/arm64/boot-okhttp.art
6ffbd5d000-6ffbd5e000 r--p 00014000 07:90 56
/apex/com.android.art/javalib/arm64/boot-core-libart.art
6ffbd5e000-6ffbd5f000 rw-p 00000000 00:00 0
[anon:arc4random data]
6ffbd5f000-6ffbd61000 r--p 00000000 00:00 0 [vvar]
6ffbd61000-6ffbd62000 r-xp 00000000 00:00 0 [vdso]
and sample /proc/$pid/smaps output (partial):
6ffad4f000-6ffad51000 rw-p 00000000 00:00 0
[anon:bionic_alloc_small_objects]
Size: 8 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
Rss: 0 kB
Pss: 0 kB
Shared_Clean: 0 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 0 kB
Anonymous: 0 kB
LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
FilePmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 8 kB
SwapPss: 8 kB
Locked: 0 kB
THPeligible: 0
VmFlags: rd wr mr mw me ac
>
> What are the risks that existing parsers will be broken by such changes?
That I can't really answer. It would depend on how the parser is
written. The implementation follows the same pattern as [stack],
[vdso] and other non-filebacked sections are named, however if a
parser is written so that it does not ignore an unknown entry then it
would fail to parse [anon:...] name if some process decides to name
its anonymous vmas.
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -63,76 +63,20 @@ static int madvise_need_mmap_write(int behavior)
> }
>
> /*
> - * We can potentially split a vm area into separate
> - * areas, each area with its own behavior.
> + * Update the vm_flags on regiion of a vma, splitting it or merging it as
^^
Eike
--
Rolf Eike Beer, emlix GmbH, http://www.emlix.com
Fon +49 551 30664-0, Fax +49 551 30664-11
Gothaer Platz 3, 37083 Göttingen, Germany
Sitz der Gesellschaft: Göttingen, Amtsgericht Göttingen HR B 3160
Geschäftsführung: Heike Jordan, Dr. Uwe Kracke – Ust-IdNr.: DE 205 198 055
emlix - smart embedded open source
On Fri, Oct 1, 2021 at 5:52 PM Suren Baghdasaryan <[email protected]> wrote:
>
> On Fri, Oct 1, 2021 at 4:08 PM Andrew Morton <[email protected]> wrote:
> >
> > On Fri, 1 Oct 2021 13:56:56 -0700 Suren Baghdasaryan <[email protected]> wrote:
> >
> > > From: Colin Cross <[email protected]>
> > >
> > > In many userspace applications, and especially in VM based applications
> > > like Android uses heavily, there are multiple different allocators in use.
> > > At a minimum there is libc malloc and the stack, and in many cases there
> > > are libc malloc, the stack, direct syscalls to mmap anonymous memory, and
> > > multiple VM heaps (one for small objects, one for big objects, etc.).
> > > Each of these layers usually has its own tools to inspect its usage;
> > > malloc by compiling a debug version, the VM through heap inspection tools,
> > > and for direct syscalls there is usually no way to track them.
> > >
> > > On Android we heavily use a set of tools that use an extended version of
> > > the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped
> > > in userspace and slice their usage by process, shared (COW) vs. unique
> > > mappings, backing, etc. This can account for real physical memory usage
> > > even in cases like fork without exec (which Android uses heavily to share
> > > as many private COW pages as possible between processes), Kernel SamePage
> > > Merging, and clean zero pages. It produces a measurement of the pages
> > > that only exist in that process (USS, for unique), and a measurement of
> > > the physical memory usage of that process with the cost of shared pages
> > > being evenly split between processes that share them (PSS).
> > >
> > > If all anonymous memory is indistinguishable then figuring out the real
> > > physical memory usage (PSS) of each heap requires either a pagemap walking
> > > tool that can understand the heap debugging of every layer, or for every
> > > layer's heap debugging tools to implement the pagemap walking logic, in
> > > which case it is hard to get a consistent view of memory across the whole
> > > system.
> > >
> > > Tracking the information in userspace leads to all sorts of problems.
> > > It either needs to be stored inside the process, which means every
> > > process has to have an API to export its current heap information upon
> > > request, or it has to be stored externally in a filesystem that
> > > somebody needs to clean up on crashes. It needs to be readable while
> > > the process is still running, so it has to have some sort of
> > > synchronization with every layer of userspace. Efficiently tracking
> > > the ranges requires reimplementing something like the kernel vma
> > > trees, and linking to it from every layer of userspace. It requires
> > > more memory, more syscalls, more runtime cost, and more complexity to
> > > separately track regions that the kernel is already tracking.
> > >
> > > This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a
> > > userspace-provided name for anonymous vmas. The names of named anonymous
> > > vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:<name>].
> > >
> > > Userspace can set the name for a region of memory by calling
> > > prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name);
> >
> > So this can cause a vma to be split, if [start,len] doesn't exactly
> > describe an existing vma? If so, is this at all useful? If not then
> > `len' isn't needed - just pass in some address within an existing vma?
>
> Technically one could mmap a large chunk of memory and then assign
> different names to its parts to use for different purposes, which
> would cause the vma to split. I don't think Android uses it that way
> but I'll have to double-check. I think one advantage of doing this
> could be to minimize the number of mmap syscalls.
>
> > > Setting the name to NULL clears it. The name length limit is 80 bytes
> > > including NUL-terminator and is checked to contain only printable ascii
> > > characters (including space), except '[',']','\','$' and '`'.
> > >
> > > The name is stored in a pointer in the shared union in vm_area_struct
> > > that points to a null terminated string. Anonymous vmas with the same
> > > name (equivalent strings) and are otherwise mergeable will be merged.
> >
> > So this can prevent vma merging if used incorrectly (or maliciously -
> > can't think how)? What are the potential impacts of this?
>
> Potential impact would be that otherwise mergeable vmas would not be
> merged due to the name difference. This is a known downside of naming
> an anon vma which I documented in my manual pages description as "Note
> that assigning an attribute to a virtual memory area might prevent it
> from being merged with adjacent virtual memory areas due to the
> difference in that attribute's value.". In Android we see an increase
> in the number of VMAs due to this feature but it was not significant.
> I'll try to dig up the numbers or will rerun the test to get them.
> If a process maliciously wants to increase the number of vmas in the
> system it could generate lots of vmas with different names in its
> address space, however this can be done even without this feature by
> mapping vmas while toggling a protection bit. Something like this:
>
> int prot = PROT_WRITE;
> for (i = 0; i < count; i++) {
> mmap(NULL, size_bytes, prot, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> prot = (prot ^ PROT_READ) & (PROT_READ | PROT_WRITE);
> }
>
> > > The name pointers are not shared between vmas even if they contain the
> > > same name. The name pointer is stored in a union with fields that are
> > > only used on file-backed mappings, so it does not increase memory usage.
> > >
> > > The patch is based on the original patch developed by Colin Cross, more
> > > specifically on its latest version [1] posted upstream by Sumit Semwal.
> > > It used a userspace pointer to store vma names. In that design, name
> > > pointers could be shared between vmas. However during the last upstreaming
> > > attempt, Kees Cook raised concerns [2] about this approach and suggested
> > > to copy the name into kernel memory space, perform validity checks [3]
> > > and store as a string referenced from vm_area_struct.
> > > One big concern is about fork() performance which would need to strdup
> > > anonymous vma names. Dave Hansen suggested experimenting with worst-case
> > > scenario of forking a process with 64k vmas having longest possible names
> > > [4]. I ran this experiment on an ARM64 Android device and recorded a
> > > worst-case regression of almost 40% when forking such a process. This
> > > regression is addressed in the followup patch which replaces the pointer
> > > to a name with a refcounted structure that allows sharing the name pointer
> > > between vmas of the same name. Instead of duplicating the string during
> > > fork() or when splitting a vma it increments the refcount.
> >
> > Generally, the patch adds a bunch of code which a lot of users won't
> > want. Did we bust a gut to reduce this impact? Was a standalone
> > config setting considered?
>
> I didn't consider a standalone config for this feature because when
> not used it has no memory impact at runtime. As for the image size, I
> built Linus' ToT with and without this patchset with allmodconfig and
> the sizes are:
> Without the patchset:
> $ size vmlinux
> text data bss dec hex filename
> 40763556 58424519 29016228 128204303 7a43e0f vmlinux
>
> With the patchset:
> $ size vmlinux
> text data bss dec hex filename
> 40765068 58424671 29016228 128205967 7a4448f vmlinux
>
> The increase seems quite small, so I'm not sure if it warrants a
> separate config option.
Andrew, do you still think we need a separate CONFIG option? I fixed
the build issue when CONFIG_ADVISE_SYSCALLS=n and would like to post
the update but if you want to have a separate config then I can post
that together with the fix. Please let me know.
Thanks,
Suren.
>
> > And what would be the impact of making this feature optional? Is a
> > proliferation of formats in /proc/pid/maps going to make userspace
> > parsers harder to develop and test?
>
> I'm guessing having one format is simpler and therefore preferable?
>
> > I do think that saying "The names of named anonymous vmas are shown in
> > /proc/pid/maps and /proc/pid/smaps as [anon:<name>]." is a bit thin.
> > Please provide sample output so we can consider these things better.
>
> Sure. Here is a sample /proc/$pid/maps output (partial):
>
> 6ffacb6000-6ffacd6000 r--s 00000000 00:10 361
> /dev/__properties__/properties_serial
> 6ffacd6000-6ffacd9000 rw-p 00000000 00:00 0
> [anon:System property context nodes]
> 6ffacd9000-6ffaceb000 r--s 00000000 00:10 79
> /dev/__properties__/property_info
> 6ffaceb000-6ffad4f000 r--p 00000000 00:00 0
> [anon:linker_alloc]
> 6ffad4f000-6ffad51000 rw-p 00000000 00:00 0
> [anon:bionic_alloc_small_objects]
> 6ffad51000-6ffad52000 r--p 00000000 00:00 0
> [anon:atexit handlers]
> 6ffad52000-6ffbc2c000 ---p 00000000 00:00 0
> 6ffbc2c000-6ffbc2e000 rw-p 00000000 00:00 0
> 6ffbc2e000-6ffbd52000 ---p 00000000 00:00 0
> 6ffbd52000-6ffbd53000 ---p 00000000 00:00 0
> 6ffbd53000-6ffbd5b000 rw-p 00000000 00:00 0
> [anon:thread signal stack]
> 6ffbd5b000-6ffbd5c000 rw-p 00000000 00:00 0
> [anon:arc4random data]
> 6ffbd5c000-6ffbd5d000 r--p 0000d000 07:90 59
> /apex/com.android.art/javalib/arm64/boot-okhttp.art
> 6ffbd5d000-6ffbd5e000 r--p 00014000 07:90 56
> /apex/com.android.art/javalib/arm64/boot-core-libart.art
> 6ffbd5e000-6ffbd5f000 rw-p 00000000 00:00 0
> [anon:arc4random data]
> 6ffbd5f000-6ffbd61000 r--p 00000000 00:00 0 [vvar]
> 6ffbd61000-6ffbd62000 r-xp 00000000 00:00 0 [vdso]
>
> and sample /proc/$pid/smaps output (partial):
>
> 6ffad4f000-6ffad51000 rw-p 00000000 00:00 0
> [anon:bionic_alloc_small_objects]
> Size: 8 kB
> KernelPageSize: 4 kB
> MMUPageSize: 4 kB
> Rss: 0 kB
> Pss: 0 kB
> Shared_Clean: 0 kB
> Shared_Dirty: 0 kB
> Private_Clean: 0 kB
> Private_Dirty: 0 kB
> Referenced: 0 kB
> Anonymous: 0 kB
> LazyFree: 0 kB
> AnonHugePages: 0 kB
> ShmemPmdMapped: 0 kB
> FilePmdMapped: 0 kB
> Shared_Hugetlb: 0 kB
> Private_Hugetlb: 0 kB
> Swap: 8 kB
> SwapPss: 8 kB
> Locked: 0 kB
> THPeligible: 0
> VmFlags: rd wr mr mw me ac
>
> >
> > What are the risks that existing parsers will be broken by such changes?
>
> That I can't really answer. It would depend on how the parser is
> written. The implementation follows the same pattern as [stack],
> [vdso] and other non-filebacked sections are named, however if a
> parser is written so that it does not ignore an unknown entry then it
> would fail to parse [anon:...] name if some process decides to name
> its anonymous vmas.
On Mon, Oct 4, 2021 at 12:03 AM Rolf Eike Beer <[email protected]> wrote:
>
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -63,76 +63,20 @@ static int madvise_need_mmap_write(int behavior)
> > }
> >
> > /*
> > - * We can potentially split a vm area into separate
> > - * areas, each area with its own behavior.
> > + * Update the vm_flags on regiion of a vma, splitting it or merging it as
> ^^
Thanks! Will fix in the next version.
>
> Eike
> --
> Rolf Eike Beer, emlix GmbH, http://www.emlix.com
> Fon +49 551 30664-0, Fax +49 551 30664-11
> Gothaer Platz 3, 37083 Göttingen, Germany
> Sitz der Gesellschaft: Göttingen, Amtsgericht Göttingen HR B 3160
> Geschäftsführung: Heike Jordan, Dr. Uwe Kracke – Ust-IdNr.: DE 205 198 055
>
> emlix - smart embedded open source
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
On Fri 2021-10-01 13:56:57, Suren Baghdasaryan wrote:
> While forking a process with high number (64K) of named anonymous vmas the
> overhead caused by strdup() is noticeable. Experiments with ARM64
Android
I still believe you should simply use numbers and do the
numbers->strings mapping in userspace. We should not need to optimize
strdups in kernel...
Best regards,
Pavel
--
http://www.livejournal.com/~pavelmachek
On Tue, Oct 5, 2021 at 11:42 AM Pavel Machek <[email protected]> wrote:
>
> On Fri 2021-10-01 13:56:57, Suren Baghdasaryan wrote:
> > While forking a process with high number (64K) of named anonymous vmas the
> > overhead caused by strdup() is noticeable. Experiments with ARM64
> Android
>
> I still believe you should simply use numbers and do the
> numbers->strings mapping in userspace. We should not need to optimize
> strdups in kernel...
Here are complications with mapping numbers to strings in the userspace:
Approach 1: hardcode number->string in some header file and let all
tools use that mapping. The issue is that whenever that mapping
changes all the tools that are using it (including 3rd party ones)
have to be rebuilt. This is not really maintainable since we don't
control 3rd party tools and even for the ones we control, it will be a
maintenance issue figuring out which version of the tool used which
header file.
Approach 2: have a centralized facility (a process or a DB)
maintaining number->string mapping. This would require an additional
request to this facility whenever we want to make a number->string
conversion. Moreover, when we want to name a VMA, we would have to
register a new VMA name in that facility or check that one already
exists and get its ID. So each prctl() call to name a VMA will be
preceded by such a request (IPC call), maybe with some optimizations
to cache already known number->string pairs. This would be quite
expensive performance-wise. Additional issue with this approach is
that this mapping will have to be persistent to handle a case when the
facility crashes and has to be restored.
As I said before, it complicates userspace quite a bit. Is that a good
enough reason to store the names in the kernel and pay a little more
memory for that? IMHO yes, but I might be wrong.
Thanks,
Suren.
>
> Best regards,
> Pavel
> --
> http://www.livejournal.com/~pavelmachek
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
On Tue, Oct 05, 2021 at 12:14:59PM -0700, Suren Baghdasaryan wrote:
> On Tue, Oct 5, 2021 at 11:42 AM Pavel Machek <[email protected]> wrote:
> >
> > On Fri 2021-10-01 13:56:57, Suren Baghdasaryan wrote:
> > > While forking a process with high number (64K) of named anonymous vmas the
> > > overhead caused by strdup() is noticeable. Experiments with ARM64
> > Android
> >
> > I still believe you should simply use numbers and do the
> > numbers->strings mapping in userspace. We should not need to optimize
> > strdups in kernel...
>
> Here are complications with mapping numbers to strings in the userspace:
> Approach 1: hardcode number->string in some header file and let all
> tools use that mapping. The issue is that whenever that mapping
> changes all the tools that are using it (including 3rd party ones)
> have to be rebuilt. This is not really maintainable since we don't
> control 3rd party tools and even for the ones we control, it will be a
> maintenance issue figuring out which version of the tool used which
> header file.
> Approach 2: have a centralized facility (a process or a DB)
> maintaining number->string mapping. This would require an additional
> request to this facility whenever we want to make a number->string
> conversion. Moreover, when we want to name a VMA, we would have to
> register a new VMA name in that facility or check that one already
> exists and get its ID. So each prctl() call to name a VMA will be
> preceded by such a request (IPC call), maybe with some optimizations
> to cache already known number->string pairs. This would be quite
> expensive performance-wise. Additional issue with this approach is
> that this mapping will have to be persistent to handle a case when the
> facility crashes and has to be restored.
>
> As I said before, it complicates userspace quite a bit. Is that a good
> enough reason to store the names in the kernel and pay a little more
> memory for that? IMHO yes, but I might be wrong.
FWIW, I prefer the strings. It's more human-readable, which is important
for the kinds of cases where the maps are being used for diagnostics,
etc.
--
Kees Cook
Hi!
> > On Fri 2021-10-01 13:56:57, Suren Baghdasaryan wrote:
> > > While forking a process with high number (64K) of named anonymous vmas the
> > > overhead caused by strdup() is noticeable. Experiments with ARM64
> > Android
> >
> > I still believe you should simply use numbers and do the
> > numbers->strings mapping in userspace. We should not need to optimize
> > strdups in kernel...
>
> Here are complications with mapping numbers to strings in the userspace:
> Approach 1: hardcode number->string in some header file and let all
> tools use that mapping. The issue is that whenever that mapping
> changes all the tools that are using it (including 3rd party ones)
> have to be rebuilt. This is not really maintainable since we don't
> control 3rd party tools and even for the ones we control, it will be a
> maintenance issue figuring out which version of the tool used which
> header file.
1a) Just put it into a file in /etc... Similar to header file but
easier...
> Approach 2: have a centralized facility (a process or a DB)
> maintaining number->string mapping. This would require an additional
> request to this facility whenever we want to make a number->string
> conversion. Moreover, when we want to name a VMA, we would have to
I see it complicates userspace. But that's better than complicating
kernel, and I don't know what limits on strings you plan, but
considering you'll be outputing the strings in /proc... someone is
going to get confused with parsing.
Pavel
--
http://www.livejournal.com/~pavelmachek
On Tue, Oct 5, 2021 at 1:04 PM Pavel Machek <[email protected]> wrote:
>
> Hi!
>
> > > On Fri 2021-10-01 13:56:57, Suren Baghdasaryan wrote:
> > > > While forking a process with high number (64K) of named anonymous vmas the
> > > > overhead caused by strdup() is noticeable. Experiments with ARM64
> > > Android
> > >
> > > I still believe you should simply use numbers and do the
> > > numbers->strings mapping in userspace. We should not need to optimize
> > > strdups in kernel...
> >
> > Here are complications with mapping numbers to strings in the userspace:
> > Approach 1: hardcode number->string in some header file and let all
> > tools use that mapping. The issue is that whenever that mapping
> > changes all the tools that are using it (including 3rd party ones)
> > have to be rebuilt. This is not really maintainable since we don't
> > control 3rd party tools and even for the ones we control, it will be a
> > maintenance issue figuring out which version of the tool used which
> > header file.
>
> 1a) Just put it into a file in /etc... Similar to header file but
> easier...
>
> > Approach 2: have a centralized facility (a process or a DB)
> > maintaining number->string mapping. This would require an additional
> > request to this facility whenever we want to make a number->string
> > conversion. Moreover, when we want to name a VMA, we would have to
>
> I see it complicates userspace. But that's better than complicating
> kernel, and I don't know what limits on strings you plan, but
> considering you'll be outputing the strings in /proc... someone is
> going to get confused with parsing.
I'm not a fan of complicating kernel but the proposed approach seems
simple enough to me. Again this is subjective, so I can't really have
a good argument here. Maybe, as Andrew suggested, I should keep it
under a separate config so that whoever does not care about this
feature pays no price for it?
On the topic of confusing the parsers, if the parser is written so
that it can't ignore new [anon:...] entry then it does not matter
whether we use strings or numbers, it will get confused either way.
Again, if we are concerned about confusing existing parsers I think
having a separate config option set to 'n' would help. This would
prevent some userspace process from naming an anonymous VMA and
causing parser confusion. OTOH on systems where parsers can handle
anon VMA names (Android) we will set that config and use the feature.
Would that address your concerns?
>
> Pavel
> --
> http://www.livejournal.com/~pavelmachek
* Suren Baghdasaryan <[email protected]> [211004 12:18]:
> On Mon, Oct 4, 2021 at 12:03 AM Rolf Eike Beer <[email protected]> wrote:
> >
> > > --- a/mm/madvise.c
> > > +++ b/mm/madvise.c
> > > @@ -63,76 +63,20 @@ static int madvise_need_mmap_write(int behavior)
> > > }
> > >
> > > /*
> > > - * We can potentially split a vm area into separate
> > > - * areas, each area with its own behavior.
> > > + * Update the vm_flags on regiion of a vma, splitting it or merging it as
> > ^^
>
> Thanks! Will fix in the next version.
Since you'll be respinning for this comment, can you please point out
that the split will keep the VMA as [vma->vm_start, new_end)? That is,
__split_vma() is passed 0 for new_below. It might prove useful since
the code is being reused.
Thanks,
Liam
>
> >
> > Eike
> > --
> > Rolf Eike Beer, emlix GmbH, http://www.emlix.com
> > Fon +49 551 30664-0, Fax +49 551 30664-11
> > Gothaer Platz 3, 37083 Göttingen, Germany
> > Sitz der Gesellschaft: Göttingen, Amtsgericht Göttingen HR B 3160
> > Geschäftsführung: Heike Jordan, Dr. Uwe Kracke – Ust-IdNr.: DE 205 198 055
> >
> > emlix - smart embedded open source
> >
> > --
> > To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
>
On Tue, Oct 5, 2021 at 2:00 PM Liam Howlett <[email protected]> wrote:
>
> * Suren Baghdasaryan <[email protected]> [211004 12:18]:
> > On Mon, Oct 4, 2021 at 12:03 AM Rolf Eike Beer <[email protected]> wrote:
> > >
> > > > --- a/mm/madvise.c
> > > > +++ b/mm/madvise.c
> > > > @@ -63,76 +63,20 @@ static int madvise_need_mmap_write(int behavior)
> > > > }
> > > >
> > > > /*
> > > > - * We can potentially split a vm area into separate
> > > > - * areas, each area with its own behavior.
> > > > + * Update the vm_flags on regiion of a vma, splitting it or merging it as
> > > ^^
> >
> > Thanks! Will fix in the next version.
>
> Since you'll be respinning for this comment, can you please point out
> that the split will keep the VMA as [vma->vm_start, new_end)? That is,
> __split_vma() is passed 0 for new_below. It might prove useful since
> the code is being reused.
Hmm. There are two cases here:
if (start != vma->vm_start) {
...
error = __split_vma(mm, vma, start, 1);
}
and
if (end != vma->vm_end) {
...
error = __split_vma(mm, vma, end, 0);
}
so, I don't think such a comment would be completely correct, no?
>
> Thanks,
> Liam
>
> >
> > >
> > > Eike
> > > --
> > > Rolf Eike Beer, emlix GmbH, http://www.emlix.com
> > > Fon +49 551 30664-0, Fax +49 551 30664-11
> > > Gothaer Platz 3, 37083 Göttingen, Germany
> > > Sitz der Gesellschaft: Göttingen, Amtsgericht Göttingen HR B 3160
> > > Geschäftsführung: Heike Jordan, Dr. Uwe Kracke – Ust-IdNr.: DE 205 198 055
> > >
> > > emlix - smart embedded open source
> > >
> > > --
> > > To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
> >
On 10/5/21 13:43, Suren Baghdasaryan wrote:
> On Tue, Oct 5, 2021 at 1:04 PM Pavel Machek <[email protected]> wrote:
>>
>> Hi!
>>
>>>> On Fri 2021-10-01 13:56:57, Suren Baghdasaryan wrote:
>>>>> While forking a process with high number (64K) of named anonymous vmas the
>>>>> overhead caused by strdup() is noticeable. Experiments with ARM64
>>>> Android
>>>>
>>>> I still believe you should simply use numbers and do the
>>>> numbers->strings mapping in userspace. We should not need to optimize
>>>> strdups in kernel...
>>>
>>> Here are complications with mapping numbers to strings in the userspace:
>>> Approach 1: hardcode number->string in some header file and let all
>>> tools use that mapping. The issue is that whenever that mapping
>>> changes all the tools that are using it (including 3rd party ones)
>>> have to be rebuilt. This is not really maintainable since we don't
>>> control 3rd party tools and even for the ones we control, it will be a
>>> maintenance issue figuring out which version of the tool used which
>>> header file.
>>
>> 1a) Just put it into a file in /etc... Similar to header file but
>> easier...
>>
>>> Approach 2: have a centralized facility (a process or a DB)
>>> maintaining number->string mapping. This would require an additional
>>> request to this facility whenever we want to make a number->string
>>> conversion. Moreover, when we want to name a VMA, we would have to
>>
>> I see it complicates userspace. But that's better than complicating
>> kernel, and I don't know what limits on strings you plan, but
>> considering you'll be outputing the strings in /proc... someone is
>> going to get confused with parsing.
>
> I'm not a fan of complicating kernel but the proposed approach seems
> simple enough to me. Again this is subjective, so I can't really have
> a good argument here. Maybe, as Andrew suggested, I should keep it
> under a separate config so that whoever does not care about this
> feature pays no price for it?
For what it's worth, I've been watching this feature proposal evolve,
and a couple of things are starting to become clear. These are of
course judgment calls, though, so even though I'm writing them as
"facts", please read them as merely "one developer's opinion and
preference":
1) Yes, just leave the strings in the kernel, that's simple and
it works, and the alternatives don't really help your case nearly
enough. The kernel changes at a different rate than distros and
user space, and keeping number->string mappings updated and correct
is just basically hopeless.
And you've beaten down the perf problems with kref, so it's fine.
2) At the same time, this feature is Just Not Needed! ...usually.
So the config option seems absolutely appropriate.
Even Pavel here will probably be content with the above mix, I
expect. :)
thanks,
--
John Hubbard
NVIDIA
On Tue 05-10-21 23:57:36, John Hubbard wrote:
[...]
> 1) Yes, just leave the strings in the kernel, that's simple and
> it works, and the alternatives don't really help your case nearly
> enough.
I do not have a strong opinion. Strings are easier to use but they
are more involved and the necessity of kref approach just underlines
that. There are going to be new allocations and that always can lead
to surprising side effects. These are small (80B at maximum) so the
overall footpring shouldn't all that large by default but it can grow
quite large with a very high max_map_count. There are workloads which
really require the default to be set high (e.g. heavy mremap users). So
if anything all those should be __GFP_ACCOUNT and memcg accounted.
I do agree that numbers are just much more simpler from accounting,
performance and implementation POV.
> The kernel changes at a different rate than distros and
> user space, and keeping number->string mappings updated and correct
> is just basically hopeless.
I am not sure I follow here. This all looks like a userspace policy. No
matter what kind of id you use. It is userspace to set and consume those
ids. Why is it different to use strings from numbers. All parties have
to agree in a naming/numbering convention anyway. Those might differ on
Android or other userspaces. What am I missing?
> And you've beaten down the perf problems with kref, so it's fine.
>
> 2) At the same time, this feature is Just Not Needed! ...usually.
> So the config option seems absolutely appropriate.
This is not really an answer. Most users are using a distro kernel so
this would need to be enabled in those configs just in case.
--
Michal Hocko
SUSE Labs
On 06.10.21 10:27, Michal Hocko wrote:
> On Tue 05-10-21 23:57:36, John Hubbard wrote:
> [...]
>> 1) Yes, just leave the strings in the kernel, that's simple and
>> it works, and the alternatives don't really help your case nearly
>> enough.
>
> I do not have a strong opinion. Strings are easier to use but they
> are more involved and the necessity of kref approach just underlines
> that. There are going to be new allocations and that always can lead
> to surprising side effects. These are small (80B at maximum) so the
> overall footpring shouldn't all that large by default but it can grow
> quite large with a very high max_map_count. There are workloads which
> really require the default to be set high (e.g. heavy mremap users). So
> if anything all those should be __GFP_ACCOUNT and memcg accounted.
>
> I do agree that numbers are just much more simpler from accounting,
> performance and implementation POV.
+1
I can understand that having a string can be quite beneficial e.g., when
dumping mmaps. If only user space knows the id <-> string mapping, that
can be quite tricky.
However, I also do wonder if there would be a way to standardize/reserve
ids, such that a given id always corresponds to a specific user. If we
use an uint64_t for an id, there would be plenty room to reserve ids ...
I'd really prefer if we can avoid using strings and instead using ids.
--
Thanks,
David / dhildenb
On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
>
> On 06.10.21 10:27, Michal Hocko wrote:
> > On Tue 05-10-21 23:57:36, John Hubbard wrote:
> > [...]
> >> 1) Yes, just leave the strings in the kernel, that's simple and
> >> it works, and the alternatives don't really help your case nearly
> >> enough.
> >
> > I do not have a strong opinion. Strings are easier to use but they
> > are more involved and the necessity of kref approach just underlines
> > that. There are going to be new allocations and that always can lead
> > to surprising side effects. These are small (80B at maximum) so the
> > overall footpring shouldn't all that large by default but it can grow
> > quite large with a very high max_map_count. There are workloads which
> > really require the default to be set high (e.g. heavy mremap users). So
> > if anything all those should be __GFP_ACCOUNT and memcg accounted.
> >
> > I do agree that numbers are just much more simpler from accounting,
> > performance and implementation POV.
>
> +1
>
> I can understand that having a string can be quite beneficial e.g., when
> dumping mmaps. If only user space knows the id <-> string mapping, that
> can be quite tricky.
>
> However, I also do wonder if there would be a way to standardize/reserve
> ids, such that a given id always corresponds to a specific user. If we
> use an uint64_t for an id, there would be plenty room to reserve ids ...
>
> I'd really prefer if we can avoid using strings and instead using ids.
I wish it was that simple and for some names like [anon:.bss] or
[anon:dalvik-zygote space] reserving a unique id would work, however
some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
are generated dynamically at runtime and include package name.
Packages are constantly evolving, new ones are developed, names can
change, etc. So assigning a unique id for these names is not really
feasible.
That leaves us with the central facility option, which as I described
in my previous email would be prohibitive from performance POV (IPC
every time we have a new name or want to convert id to name).
I'm all for simplicity but the simple approach of using ids instead of
names unfortunately would not work for our usecases.
>
> --
> Thanks,
>
> David / dhildenb
>
On 06.10.21 17:01, Suren Baghdasaryan wrote:
> On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
>>
>> On 06.10.21 10:27, Michal Hocko wrote:
>>> On Tue 05-10-21 23:57:36, John Hubbard wrote:
>>> [...]
>>>> 1) Yes, just leave the strings in the kernel, that's simple and
>>>> it works, and the alternatives don't really help your case nearly
>>>> enough.
>>>
>>> I do not have a strong opinion. Strings are easier to use but they
>>> are more involved and the necessity of kref approach just underlines
>>> that. There are going to be new allocations and that always can lead
>>> to surprising side effects. These are small (80B at maximum) so the
>>> overall footpring shouldn't all that large by default but it can grow
>>> quite large with a very high max_map_count. There are workloads which
>>> really require the default to be set high (e.g. heavy mremap users). So
>>> if anything all those should be __GFP_ACCOUNT and memcg accounted.
>>>
>>> I do agree that numbers are just much more simpler from accounting,
>>> performance and implementation POV.
>>
>> +1
>>
>> I can understand that having a string can be quite beneficial e.g., when
>> dumping mmaps. If only user space knows the id <-> string mapping, that
>> can be quite tricky.
>>
>> However, I also do wonder if there would be a way to standardize/reserve
>> ids, such that a given id always corresponds to a specific user. If we
>> use an uint64_t for an id, there would be plenty room to reserve ids ...
>>
>> I'd really prefer if we can avoid using strings and instead using ids.
>
> I wish it was that simple and for some names like [anon:.bss] or
> [anon:dalvik-zygote space] reserving a unique id would work, however
> some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> are generated dynamically at runtime and include package name.
Valuable information
> Packages are constantly evolving, new ones are developed, names can
> change, etc. So assigning a unique id for these names is not really
> feasible.
So, you'd actually want to generate/reserve an id for a given string at
runtime, assign that id to the VMA, and have a way to match id <->
string somehow?
That reservation service could be inside the kernel or even (better?) in
user space. The service could for example de-duplicates strings.
My question would be, if we really have to expose these strings to the
kernel, or if an id is sufficient. Sure, it would move complexity to
user space, but keeping complexity out of the kernel is usually a good idea.
--
Thanks,
David / dhildenb
On Wed, Oct 6, 2021 at 8:08 AM David Hildenbrand <[email protected]> wrote:
>
> On 06.10.21 17:01, Suren Baghdasaryan wrote:
> > On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
> >>
> >> On 06.10.21 10:27, Michal Hocko wrote:
> >>> On Tue 05-10-21 23:57:36, John Hubbard wrote:
> >>> [...]
> >>>> 1) Yes, just leave the strings in the kernel, that's simple and
> >>>> it works, and the alternatives don't really help your case nearly
> >>>> enough.
> >>>
> >>> I do not have a strong opinion. Strings are easier to use but they
> >>> are more involved and the necessity of kref approach just underlines
> >>> that. There are going to be new allocations and that always can lead
> >>> to surprising side effects. These are small (80B at maximum) so the
> >>> overall footpring shouldn't all that large by default but it can grow
> >>> quite large with a very high max_map_count. There are workloads which
> >>> really require the default to be set high (e.g. heavy mremap users). So
> >>> if anything all those should be __GFP_ACCOUNT and memcg accounted.
> >>>
> >>> I do agree that numbers are just much more simpler from accounting,
> >>> performance and implementation POV.
> >>
> >> +1
> >>
> >> I can understand that having a string can be quite beneficial e.g., when
> >> dumping mmaps. If only user space knows the id <-> string mapping, that
> >> can be quite tricky.
> >>
> >> However, I also do wonder if there would be a way to standardize/reserve
> >> ids, such that a given id always corresponds to a specific user. If we
> >> use an uint64_t for an id, there would be plenty room to reserve ids ...
> >>
> >> I'd really prefer if we can avoid using strings and instead using ids.
> >
> > I wish it was that simple and for some names like [anon:.bss] or
> > [anon:dalvik-zygote space] reserving a unique id would work, however
> > some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> > are generated dynamically at runtime and include package name.
>
> Valuable information
Yeah, I should have described it clearer the first time around.
>
> > Packages are constantly evolving, new ones are developed, names can
> > change, etc. So assigning a unique id for these names is not really
> > feasible.
>
> So, you'd actually want to generate/reserve an id for a given string at
> runtime, assign that id to the VMA, and have a way to match id <->
> string somehow?
If we go with ids then yes, that is what we would have to do.
> That reservation service could be inside the kernel or even (better?) in
> user space. The service could for example de-duplicates strings.
Yes but it would require an IPC call to that service potentially on
every mmap() when we want to name a mapped vma. This would be
prohibitive. Even on consumption side, instead of just dumping
/proc/$pid/maps we would have to parse the file and convert all
[anon:id] into [anon:name] with each conversion requiring an IPC call
(assuming no id->name pair caching on the client side).
> My question would be, if we really have to expose these strings to the
> kernel, or if an id is sufficient. Sure, it would move complexity to
> user space, but keeping complexity out of the kernel is usually a good idea.
My worry here is not the additional complexity on the userspace side
but the performance hit we would have to endure due to these
conversions.
> --
> Thanks,
>
> David / dhildenb
>
* Suren Baghdasaryan <[email protected]> [211005 17:31]:
> On Tue, Oct 5, 2021 at 2:00 PM Liam Howlett <[email protected]> wrote:
> >
> > * Suren Baghdasaryan <[email protected]> [211004 12:18]:
> > > On Mon, Oct 4, 2021 at 12:03 AM Rolf Eike Beer <[email protected]> wrote:
> > > >
> > > > > --- a/mm/madvise.c
> > > > > +++ b/mm/madvise.c
> > > > > @@ -63,76 +63,20 @@ static int madvise_need_mmap_write(int behavior)
> > > > > }
> > > > >
> > > > > /*
> > > > > - * We can potentially split a vm area into separate
> > > > > - * areas, each area with its own behavior.
> > > > > + * Update the vm_flags on regiion of a vma, splitting it or merging it as
> > > > ^^
> > >
> > > Thanks! Will fix in the next version.
> >
> > Since you'll be respinning for this comment, can you please point out
> > that the split will keep the VMA as [vma->vm_start, new_end)? That is,
> > __split_vma() is passed 0 for new_below. It might prove useful since
> > the code is being reused.
>
> Hmm. There are two cases here:
>
> if (start != vma->vm_start) {
> ...
> error = __split_vma(mm, vma, start, 1);
> }
>
> and
>
> if (end != vma->vm_end) {
> ...
> error = __split_vma(mm, vma, end, 0);
> }
>
> so, I don't think such a comment would be completely correct, no?
Yes, you are correct. I'm not sure how I missed that.
Thanks,
Liam
Hi!
> > I can understand that having a string can be quite beneficial e.g., when
> > dumping mmaps. If only user space knows the id <-> string mapping, that
> > can be quite tricky.
> >
> > However, I also do wonder if there would be a way to standardize/reserve
> > ids, such that a given id always corresponds to a specific user. If we
> > use an uint64_t for an id, there would be plenty room to reserve ids ...
> >
> > I'd really prefer if we can avoid using strings and instead using ids.
>
> I wish it was that simple and for some names like [anon:.bss] or
> [anon:dalvik-zygote space] reserving a unique id would work, however
> some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> are generated dynamically at runtime and include package name.
I'd be careful; if you allow special characters like that, you will
confuse some kind of parser.
> Packages are constantly evolving, new ones are developed, names can
> change, etc. So assigning a unique id for these names is not really
> feasible.
> That leaves us with the central facility option, which as I described
> in my previous email would be prohibitive from performance POV (IPC
> every time we have a new name or want to convert id to name).
That "central facility" option can be as simple as "mkdir
/somewhere/sanitized_id", using inode numbers for example. You don't
really need IPC.
Plus, I don't really believe the IPC cost would be prohibitive.
Or you could simply hash the string and use the hash as id...
Best regards,
Pavel
--
http://www.livejournal.com/~pavelmachek
On Wed, Oct 6, 2021 at 10:58 AM Pavel Machek <[email protected]> wrote:
>
> Hi!
>
> > > I can understand that having a string can be quite beneficial e.g., when
> > > dumping mmaps. If only user space knows the id <-> string mapping, that
> > > can be quite tricky.
> > >
> > > However, I also do wonder if there would be a way to standardize/reserve
> > > ids, such that a given id always corresponds to a specific user. If we
> > > use an uint64_t for an id, there would be plenty room to reserve ids ...
> > >
> > > I'd really prefer if we can avoid using strings and instead using ids.
> >
> > I wish it was that simple and for some names like [anon:.bss] or
> > [anon:dalvik-zygote space] reserving a unique id would work, however
> > some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> > are generated dynamically at runtime and include package name.
>
> I'd be careful; if you allow special characters like that, you will
> confuse some kind of parser.
That's why I think having a separate config option with default being
disabled would be useful here. It can be enabled on the systems which
handle [anon:...] correctly without affecting all other systems.
>
> > Packages are constantly evolving, new ones are developed, names can
> > change, etc. So assigning a unique id for these names is not really
> > feasible.
> > That leaves us with the central facility option, which as I described
> > in my previous email would be prohibitive from performance POV (IPC
> > every time we have a new name or want to convert id to name).
>
> That "central facility" option can be as simple as "mkdir
> /somewhere/sanitized_id", using inode numbers for example. You don't
> really need IPC.
Hmm, so the suggestion is to have some directory which contains files
representing IDs, each containing the string name of the associated
vma? Then let's say we are creating a new VMA and want to name it. We
would have to scan that directory, check all files and see if any of
them contain the name we want to reuse the same ID. This is while we
are doing mmap() call, which is often a part of time sensitive
operation (for example app is starting and allocating some memory to
operate). App startup time is one of the main metrics our users care
about and regressing that would not go well with our QA team.
>
> Plus, I don't really believe the IPC cost would be prohibitive.
This option was discussed by the Android performance team and rejected
8 years ago. The problem is that these operations are often happening
in performance-sensitive areas of the process lifecycle.
>
> Or you could simply hash the string and use the hash as id...
I don't see how this would help. You still need to register your
hash->name association somewhere for that hash to be converted back to
the name. Did I misunderstand your suggestion?
>
> Best regards,
> Pavel
> --
> http://www.livejournal.com/~pavelmachek
On Wed, 6 Oct 2021 08:20:20 -0700 Suren Baghdasaryan <[email protected]> wrote:
> On Wed, Oct 6, 2021 at 8:08 AM David Hildenbrand <[email protected]> wrote:
> >
> > On 06.10.21 17:01, Suren Baghdasaryan wrote:
> > > On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
> > >>
> > >> On 06.10.21 10:27, Michal Hocko wrote:
> > >>> On Tue 05-10-21 23:57:36, John Hubbard wrote:
> > >>> [...]
> > >>>> 1) Yes, just leave the strings in the kernel, that's simple and
> > >>>> it works, and the alternatives don't really help your case nearly
> > >>>> enough.
> > >>>
> > >>> I do not have a strong opinion. Strings are easier to use but they
> > >>> are more involved and the necessity of kref approach just underlines
> > >>> that. There are going to be new allocations and that always can lead
> > >>> to surprising side effects. These are small (80B at maximum) so the
> > >>> overall footpring shouldn't all that large by default but it can grow
> > >>> quite large with a very high max_map_count. There are workloads which
> > >>> really require the default to be set high (e.g. heavy mremap users). So
> > >>> if anything all those should be __GFP_ACCOUNT and memcg accounted.
> > >>>
> > >>> I do agree that numbers are just much more simpler from accounting,
> > >>> performance and implementation POV.
> > >>
> > >> +1
> > >>
> > >> I can understand that having a string can be quite beneficial e.g., when
> > >> dumping mmaps. If only user space knows the id <-> string mapping, that
> > >> can be quite tricky.
> > >>
> > >> However, I also do wonder if there would be a way to standardize/reserve
> > >> ids, such that a given id always corresponds to a specific user. If we
> > >> use an uint64_t for an id, there would be plenty room to reserve ids ...
> > >>
> > >> I'd really prefer if we can avoid using strings and instead using ids.
> > >
> > > I wish it was that simple and for some names like [anon:.bss] or
> > > [anon:dalvik-zygote space] reserving a unique id would work, however
> > > some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> > > are generated dynamically at runtime and include package name.
> >
> > Valuable information
>
> Yeah, I should have described it clearer the first time around.
If it gets this fancy then the 80 char limit is likely to become a
significant limitation and the choice should be explained & justified.
Why not 97? 1034? Why not just strndup_user() and be done with it?
> > My question would be, if we really have to expose these strings to the
> > kernel, or if an id is sufficient. Sure, it would move complexity to
> > user space, but keeping complexity out of the kernel is usually a good idea.
>
> My worry here is not the additional complexity on the userspace side
> but the performance hit we would have to endure due to these
> conversions.
Has the performance hit been quantified?
I've seen this many times down the ages. Something which *could* be
done in userspace is instead done in the kernel because coordinating
userspace is Just So Damn Hard. I guess the central problem is that
userspace isn't centrally coordinated. I wish we were better at this.
On Mon, 4 Oct 2021 09:21:42 -0700 Suren Baghdasaryan <[email protected]> wrote:
> > > > The name pointers are not shared between vmas even if they contain the
> > > > same name. The name pointer is stored in a union with fields that are
> > > > only used on file-backed mappings, so it does not increase memory usage.
> > > >
> > > > The patch is based on the original patch developed by Colin Cross, more
> > > > specifically on its latest version [1] posted upstream by Sumit Semwal.
> > > > It used a userspace pointer to store vma names. In that design, name
> > > > pointers could be shared between vmas. However during the last upstreaming
> > > > attempt, Kees Cook raised concerns [2] about this approach and suggested
> > > > to copy the name into kernel memory space, perform validity checks [3]
> > > > and store as a string referenced from vm_area_struct.
> > > > One big concern is about fork() performance which would need to strdup
> > > > anonymous vma names. Dave Hansen suggested experimenting with worst-case
> > > > scenario of forking a process with 64k vmas having longest possible names
> > > > [4]. I ran this experiment on an ARM64 Android device and recorded a
> > > > worst-case regression of almost 40% when forking such a process. This
> > > > regression is addressed in the followup patch which replaces the pointer
> > > > to a name with a refcounted structure that allows sharing the name pointer
> > > > between vmas of the same name. Instead of duplicating the string during
> > > > fork() or when splitting a vma it increments the refcount.
> > >
> > > Generally, the patch adds a bunch of code which a lot of users won't
> > > want. Did we bust a gut to reduce this impact? Was a standalone
> > > config setting considered?
> >
> > I didn't consider a standalone config for this feature because when
> > not used it has no memory impact at runtime. As for the image size, I
> > built Linus' ToT with and without this patchset with allmodconfig and
allnoconfig would be more interesting. People who want small kernels
won't be using allmodconfig!
> > the sizes are:
> > Without the patchset:
> > $ size vmlinux
> > text data bss dec hex filename
> > 40763556 58424519 29016228 128204303 7a43e0f vmlinux
> >
> > With the patchset:
> > $ size vmlinux
> > text data bss dec hex filename
> > 40765068 58424671 29016228 128205967 7a4448f vmlinux
> >
> > The increase seems quite small, so I'm not sure if it warrants a
> > separate config option.
>
> Andrew, do you still think we need a separate CONFIG option? I fixed
> the build issue when CONFIG_ADVISE_SYSCALLS=n and would like to post
> the update but if you want to have a separate config then I can post
> that together with the fix. Please let me know.
I don't see much downside to the standalone option. More complexity
for developers/testers, I guess. But such is life?
On Wed, Oct 6, 2021 at 7:29 PM Andrew Morton <[email protected]> wrote:
>
> On Wed, 6 Oct 2021 08:20:20 -0700 Suren Baghdasaryan <[email protected]> wrote:
>
> > On Wed, Oct 6, 2021 at 8:08 AM David Hildenbrand <[email protected]> wrote:
> > >
> > > On 06.10.21 17:01, Suren Baghdasaryan wrote:
> > > > On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
> > > >>
> > > >> On 06.10.21 10:27, Michal Hocko wrote:
> > > >>> On Tue 05-10-21 23:57:36, John Hubbard wrote:
> > > >>> [...]
> > > >>>> 1) Yes, just leave the strings in the kernel, that's simple and
> > > >>>> it works, and the alternatives don't really help your case nearly
> > > >>>> enough.
> > > >>>
> > > >>> I do not have a strong opinion. Strings are easier to use but they
> > > >>> are more involved and the necessity of kref approach just underlines
> > > >>> that. There are going to be new allocations and that always can lead
> > > >>> to surprising side effects. These are small (80B at maximum) so the
> > > >>> overall footpring shouldn't all that large by default but it can grow
> > > >>> quite large with a very high max_map_count. There are workloads which
> > > >>> really require the default to be set high (e.g. heavy mremap users). So
> > > >>> if anything all those should be __GFP_ACCOUNT and memcg accounted.
> > > >>>
> > > >>> I do agree that numbers are just much more simpler from accounting,
> > > >>> performance and implementation POV.
> > > >>
> > > >> +1
> > > >>
> > > >> I can understand that having a string can be quite beneficial e.g., when
> > > >> dumping mmaps. If only user space knows the id <-> string mapping, that
> > > >> can be quite tricky.
> > > >>
> > > >> However, I also do wonder if there would be a way to standardize/reserve
> > > >> ids, such that a given id always corresponds to a specific user. If we
> > > >> use an uint64_t for an id, there would be plenty room to reserve ids ...
> > > >>
> > > >> I'd really prefer if we can avoid using strings and instead using ids.
> > > >
> > > > I wish it was that simple and for some names like [anon:.bss] or
> > > > [anon:dalvik-zygote space] reserving a unique id would work, however
> > > > some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> > > > are generated dynamically at runtime and include package name.
> > >
> > > Valuable information
> >
> > Yeah, I should have described it clearer the first time around.
>
> If it gets this fancy then the 80 char limit is likely to become a
> significant limitation and the choice should be explained & justified.
>
> Why not 97? 1034? Why not just strndup_user() and be done with it?
The original patch from 8 years ago used 256 as the limit but Rasmus
argued that the string content should be human-readable, so 80 chars
seems to be a reasonable limit (see:
https://lore.kernel.org/all/[email protected]),
which makes sense to me. We should be able to handle the 80 char limit
by trimming it before calling prctl().
>
> > > My question would be, if we really have to expose these strings to the
> > > kernel, or if an id is sufficient. Sure, it would move complexity to
> > > user space, but keeping complexity out of the kernel is usually a good idea.
> >
> > My worry here is not the additional complexity on the userspace side
> > but the performance hit we would have to endure due to these
> > conversions.
>
> Has the performance hit been quantified?
I'll try to get the data that was collected or at least an estimate. I
imagine collecting such data would require considerable userspace
redesign.
> I've seen this many times down the ages. Something which *could* be
> done in userspace is instead done in the kernel because coordinating
> userspace is Just So Damn Hard. I guess the central problem is that
> userspace isn't centrally coordinated. I wish we were better at this.
It's not just hard, it's also inefficient. And for our usecase
performance is important.
>
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
>
On Wed, Oct 6, 2021 at 7:39 PM Andrew Morton <[email protected]> wrote:
>
> On Mon, 4 Oct 2021 09:21:42 -0700 Suren Baghdasaryan <[email protected]> wrote:
>
> > > > > The name pointers are not shared between vmas even if they contain the
> > > > > same name. The name pointer is stored in a union with fields that are
> > > > > only used on file-backed mappings, so it does not increase memory usage.
> > > > >
> > > > > The patch is based on the original patch developed by Colin Cross, more
> > > > > specifically on its latest version [1] posted upstream by Sumit Semwal.
> > > > > It used a userspace pointer to store vma names. In that design, name
> > > > > pointers could be shared between vmas. However during the last upstreaming
> > > > > attempt, Kees Cook raised concerns [2] about this approach and suggested
> > > > > to copy the name into kernel memory space, perform validity checks [3]
> > > > > and store as a string referenced from vm_area_struct.
> > > > > One big concern is about fork() performance which would need to strdup
> > > > > anonymous vma names. Dave Hansen suggested experimenting with worst-case
> > > > > scenario of forking a process with 64k vmas having longest possible names
> > > > > [4]. I ran this experiment on an ARM64 Android device and recorded a
> > > > > worst-case regression of almost 40% when forking such a process. This
> > > > > regression is addressed in the followup patch which replaces the pointer
> > > > > to a name with a refcounted structure that allows sharing the name pointer
> > > > > between vmas of the same name. Instead of duplicating the string during
> > > > > fork() or when splitting a vma it increments the refcount.
> > > >
> > > > Generally, the patch adds a bunch of code which a lot of users won't
> > > > want. Did we bust a gut to reduce this impact? Was a standalone
> > > > config setting considered?
> > >
> > > I didn't consider a standalone config for this feature because when
> > > not used it has no memory impact at runtime. As for the image size, I
> > > built Linus' ToT with and without this patchset with allmodconfig and
>
> allnoconfig would be more interesting. People who want small kernels
> won't be using allmodconfig!
Sure, I will check that and report back.
>
> > > the sizes are:
> > > Without the patchset:
> > > $ size vmlinux
> > > text data bss dec hex filename
> > > 40763556 58424519 29016228 128204303 7a43e0f vmlinux
> > >
> > > With the patchset:
> > > $ size vmlinux
> > > text data bss dec hex filename
> > > 40765068 58424671 29016228 128205967 7a4448f vmlinux
> > >
> > > The increase seems quite small, so I'm not sure if it warrants a
> > > separate config option.
> >
> > Andrew, do you still think we need a separate CONFIG option? I fixed
> > the build issue when CONFIG_ADVISE_SYSCALLS=n and would like to post
> > the update but if you want to have a separate config then I can post
> > that together with the fix. Please let me know.
>
> I don't see much downside to the standalone option. More complexity
> for developers/testers, I guess. But such is life?
Sounds good to me. I will post a new version with a separate config if
we get over the objections of using numbers instead of strings.
Thanks!
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
>
On Wed, 6 Oct 2021 19:46:57 -0700 Suren Baghdasaryan <[email protected]> wrote:
> > > > > I wish it was that simple and for some names like [anon:.bss] or
> > > > > [anon:dalvik-zygote space] reserving a unique id would work, however
> > > > > some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> > > > > are generated dynamically at runtime and include package name.
> > > >
> > > > Valuable information
> > >
> > > Yeah, I should have described it clearer the first time around.
> >
> > If it gets this fancy then the 80 char limit is likely to become a
> > significant limitation and the choice should be explained & justified.
> >
> > Why not 97? 1034? Why not just strndup_user() and be done with it?
>
> The original patch from 8 years ago used 256 as the limit but Rasmus
> argued that the string content should be human-readable, so 80 chars
> seems to be a reasonable limit (see:
> https://lore.kernel.org/all/[email protected]),
> which makes sense to me. We should be able to handle the 80 char limit
> by trimming it before calling prctl().
What's the downside to making it unlimited?
On Wed, Oct 6, 2021 at 7:53 PM Andrew Morton <[email protected]> wrote:
>
> On Wed, 6 Oct 2021 19:46:57 -0700 Suren Baghdasaryan <[email protected]> wrote:
>
> > > > > > I wish it was that simple and for some names like [anon:.bss] or
> > > > > > [anon:dalvik-zygote space] reserving a unique id would work, however
> > > > > > some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> > > > > > are generated dynamically at runtime and include package name.
> > > > >
> > > > > Valuable information
> > > >
> > > > Yeah, I should have described it clearer the first time around.
> > >
> > > If it gets this fancy then the 80 char limit is likely to become a
> > > significant limitation and the choice should be explained & justified.
> > >
> > > Why not 97? 1034? Why not just strndup_user() and be done with it?
> >
> > The original patch from 8 years ago used 256 as the limit but Rasmus
> > argued that the string content should be human-readable, so 80 chars
> > seems to be a reasonable limit (see:
> > https://lore.kernel.org/all/[email protected]),
> > which makes sense to me. We should be able to handle the 80 char limit
> > by trimming it before calling prctl().
>
> What's the downside to making it unlimited?
If we ignore the human-readability argument, I guess the possibility
of abuse and increased memory consumption? I'm guessing parsing such a
string is also easier if there is a known limit?
>
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
>
On 06.10.21 17:20, Suren Baghdasaryan wrote:
> On Wed, Oct 6, 2021 at 8:08 AM David Hildenbrand <[email protected]> wrote:
>>
>> On 06.10.21 17:01, Suren Baghdasaryan wrote:
>>> On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
>>>>
>>>> On 06.10.21 10:27, Michal Hocko wrote:
>>>>> On Tue 05-10-21 23:57:36, John Hubbard wrote:
>>>>> [...]
>>>>>> 1) Yes, just leave the strings in the kernel, that's simple and
>>>>>> it works, and the alternatives don't really help your case nearly
>>>>>> enough.
>>>>>
>>>>> I do not have a strong opinion. Strings are easier to use but they
>>>>> are more involved and the necessity of kref approach just underlines
>>>>> that. There are going to be new allocations and that always can lead
>>>>> to surprising side effects. These are small (80B at maximum) so the
>>>>> overall footpring shouldn't all that large by default but it can grow
>>>>> quite large with a very high max_map_count. There are workloads which
>>>>> really require the default to be set high (e.g. heavy mremap users). So
>>>>> if anything all those should be __GFP_ACCOUNT and memcg accounted.
>>>>>
>>>>> I do agree that numbers are just much more simpler from accounting,
>>>>> performance and implementation POV.
>>>>
>>>> +1
>>>>
>>>> I can understand that having a string can be quite beneficial e.g., when
>>>> dumping mmaps. If only user space knows the id <-> string mapping, that
>>>> can be quite tricky.
>>>>
>>>> However, I also do wonder if there would be a way to standardize/reserve
>>>> ids, such that a given id always corresponds to a specific user. If we
>>>> use an uint64_t for an id, there would be plenty room to reserve ids ...
>>>>
>>>> I'd really prefer if we can avoid using strings and instead using ids.
>>>
>>> I wish it was that simple and for some names like [anon:.bss] or
>>> [anon:dalvik-zygote space] reserving a unique id would work, however
>>> some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
>>> are generated dynamically at runtime and include package name.
>>
>> Valuable information
>
> Yeah, I should have described it clearer the first time around.
>
>>
>>> Packages are constantly evolving, new ones are developed, names can
>>> change, etc. So assigning a unique id for these names is not really
>>> feasible.
>>
>> So, you'd actually want to generate/reserve an id for a given string at
>> runtime, assign that id to the VMA, and have a way to match id <->
>> string somehow?
>
> If we go with ids then yes, that is what we would have to do.
>
>> That reservation service could be inside the kernel or even (better?) in
>> user space. The service could for example de-duplicates strings.
>
> Yes but it would require an IPC call to that service potentially on
> every mmap() when we want to name a mapped vma. This would be
> prohibitive. Even on consumption side, instead of just dumping
> /proc/$pid/maps we would have to parse the file and convert all
> [anon:id] into [anon:name] with each conversion requiring an IPC call
> (assuming no id->name pair caching on the client side).
mmap() and prctl() already do take the mmap sem in write, so they are
not the "most lightweight" operations so to say.
We already to have two separate operations, first the mmap(), then the
prctl(). IMHO you could defer the "naming" part to a later point in
time, without creating too many issues, moving it out of the
"hot/performance critical phase"
Reading https://lwn.net/Articles/867818/, to me it feels like the use
case could live with a little larger delay between the mmap popping up
and a name getting assigned.
--
Thanks,
David / dhildenb
On 07.10.21 05:01, Suren Baghdasaryan wrote:
> On Wed, Oct 6, 2021 at 7:53 PM Andrew Morton <[email protected]> wrote:
>>
>> On Wed, 6 Oct 2021 19:46:57 -0700 Suren Baghdasaryan <[email protected]> wrote:
>>
>>>>>>> I wish it was that simple and for some names like [anon:.bss] or
>>>>>>> [anon:dalvik-zygote space] reserving a unique id would work, however
>>>>>>> some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
>>>>>>> are generated dynamically at runtime and include package name.
>>>>>>
>>>>>> Valuable information
>>>>>
>>>>> Yeah, I should have described it clearer the first time around.
>>>>
>>>> If it gets this fancy then the 80 char limit is likely to become a
>>>> significant limitation and the choice should be explained & justified.
>>>>
>>>> Why not 97? 1034? Why not just strndup_user() and be done with it?
>>>
>>> The original patch from 8 years ago used 256 as the limit but Rasmus
>>> argued that the string content should be human-readable, so 80 chars
>>> seems to be a reasonable limit (see:
>>> https://lore.kernel.org/all/[email protected]),
>>> which makes sense to me. We should be able to handle the 80 char limit
>>> by trimming it before calling prctl().
>>
>> What's the downside to making it unlimited?
>
> If we ignore the human-readability argument, I guess the possibility
> of abuse and increased memory consumption? I'm guessing parsing such a
> string is also easier if there is a known limit?
64k * 80 bytes already makes me nervous enough :)
--
Thanks,
David / dhildenb
On Wed 06-10-21 08:01:56, Suren Baghdasaryan wrote:
> On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
> >
> > On 06.10.21 10:27, Michal Hocko wrote:
> > > On Tue 05-10-21 23:57:36, John Hubbard wrote:
> > > [...]
> > >> 1) Yes, just leave the strings in the kernel, that's simple and
> > >> it works, and the alternatives don't really help your case nearly
> > >> enough.
> > >
> > > I do not have a strong opinion. Strings are easier to use but they
> > > are more involved and the necessity of kref approach just underlines
> > > that. There are going to be new allocations and that always can lead
> > > to surprising side effects. These are small (80B at maximum) so the
> > > overall footpring shouldn't all that large by default but it can grow
> > > quite large with a very high max_map_count. There are workloads which
> > > really require the default to be set high (e.g. heavy mremap users). So
> > > if anything all those should be __GFP_ACCOUNT and memcg accounted.
> > >
> > > I do agree that numbers are just much more simpler from accounting,
> > > performance and implementation POV.
> >
> > +1
> >
> > I can understand that having a string can be quite beneficial e.g., when
> > dumping mmaps. If only user space knows the id <-> string mapping, that
> > can be quite tricky.
> >
> > However, I also do wonder if there would be a way to standardize/reserve
> > ids, such that a given id always corresponds to a specific user. If we
> > use an uint64_t for an id, there would be plenty room to reserve ids ...
> >
> > I'd really prefer if we can avoid using strings and instead using ids.
>
> I wish it was that simple and for some names like [anon:.bss] or
> [anon:dalvik-zygote space] reserving a unique id would work, however
> some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> are generated dynamically at runtime and include package name.
> Packages are constantly evolving, new ones are developed, names can
> change, etc. So assigning a unique id for these names is not really
> feasible.
I still do not follow. If you need a globaly consistent naming then
you need clear rules for that, no matter whether that is number or a
file. How do you handle this with strings currently?
--
Michal Hocko
SUSE Labs
Hi!
> [...]
> > > That "central facility" option can be as simple as "mkdir
> > > /somewhere/sanitized_id", using inode numbers for example. You don't
> > > really need IPC.
> >
> > Hmm, so the suggestion is to have some directory which contains files
> > representing IDs, each containing the string name of the associated
> > vma? Then let's say we are creating a new VMA and want to name it. We
> > would have to scan that directory, check all files and see if any of
> > them contain the name we want to reuse the same ID.
>
> I believe Pavel meant something as simple as
> $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> $ touch $YOUR_FILE
> $ stat -c %i $YOUR_FILE
>
> YOUR_IDS_DIR can live on a tmpfs and you can even implement a policy on
> top of that (who can generate new ids, gurantee uniqness etc...).
>
> The above is certainly not for free of course but if you really need a
> system wide consistency when using names then you need some sort of
> central authority. How you implement that is not all that important
> but I do not think we want to handle that in the kernel.
For the record, yes, that is what I meant.
Thanks,
Pavel
--
http://www.livejournal.com/~pavelmachek
On Wed 06-10-21 11:18:31, Suren Baghdasaryan wrote:
> On Wed, Oct 6, 2021 at 10:58 AM Pavel Machek <[email protected]> wrote:
[...]
> > That "central facility" option can be as simple as "mkdir
> > /somewhere/sanitized_id", using inode numbers for example. You don't
> > really need IPC.
>
> Hmm, so the suggestion is to have some directory which contains files
> representing IDs, each containing the string name of the associated
> vma? Then let's say we are creating a new VMA and want to name it. We
> would have to scan that directory, check all files and see if any of
> them contain the name we want to reuse the same ID.
I believe Pavel meant something as simple as
$ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
$ touch $YOUR_FILE
$ stat -c %i $YOUR_FILE
YOUR_IDS_DIR can live on a tmpfs and you can even implement a policy on
top of that (who can generate new ids, gurantee uniqness etc...).
The above is certainly not for free of course but if you really need a
system wide consistency when using names then you need some sort of
central authority. How you implement that is not all that important
but I do not think we want to handle that in the kernel.
--
Michal Hocko
SUSE Labs
On 07/10/2021 10.10, Michal Hocko wrote:
> On Wed 06-10-21 11:18:31, Suren Baghdasaryan wrote:
>> On Wed, Oct 6, 2021 at 10:58 AM Pavel Machek <[email protected]> wrote:
> [...]
>>> That "central facility" option can be as simple as "mkdir
>>> /somewhere/sanitized_id", using inode numbers for example. You don't
>>> really need IPC.
>>
>> Hmm, so the suggestion is to have some directory which contains files
>> representing IDs, each containing the string name of the associated
>> vma? Then let's say we are creating a new VMA and want to name it. We
>> would have to scan that directory, check all files and see if any of
>> them contain the name we want to reuse the same ID.
>
> I believe Pavel meant something as simple as
> $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> $ touch $YOUR_FILE
> $ stat -c %i $YOUR_FILE
So in terms of syscall overhead, that would be open(..., O_CREAT |
O_CLOEXEC), fstat(), close() - or one could optimistically start by
doing a single lstat() if it is normal that the name is already created
(which I assume).
As for the consumer, one can't directly map an inode number to a dentry,
but whoever first creates the name->id mapping could also be responsible
for doing a "sprintf(buf, "/id/to/name/%016lx", id); symlink(name,
buf)". And if one did the optimistic lstat() succesfully, one would know
that someone else created the file and thus the symlink. And since the
operations are idempotent, the obvious races are irrelevant.
Then the consumer would only need to do a readlink() to get the name.
But that would only be for presentation to a human. Internally all the
aggregation based on the type of anon mem the tool might as well do in
terms of the integer id.
> YOUR_IDS_DIR can live on a tmpfs and you can even implement a policy on
> top of that (who can generate new ids, gurantee uniqness etc...).
>
> The above is certainly not for free of course but if you really need a
> system wide consistency when using names then you need some sort of
> central authority. How you implement that is not all that important
> but I do not think we want to handle that in the kernel.
IDK. If the whole thing could be put behind a CONFIG_ knob, with _zero_
overhead when not enabled (and I'm a bit worried about all the functions
that grow an extra argument that gets passed around), I don't mind the
string interface. But I don't really have a say either way.
Rasmus
Hi!
> >> Hmm, so the suggestion is to have some directory which contains files
> >> representing IDs, each containing the string name of the associated
> >> vma? Then let's say we are creating a new VMA and want to name it. We
> >> would have to scan that directory, check all files and see if any of
> >> them contain the name we want to reuse the same ID.
> >
> > I believe Pavel meant something as simple as
> > $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > $ touch $YOUR_FILE
> > $ stat -c %i $YOUR_FILE
>
> So in terms of syscall overhead, that would be open(..., O_CREAT |
> O_CLOEXEC), fstat(), close() - or one could optimistically start by
You could get to two if you used mkdir instead of open.
> > YOUR_IDS_DIR can live on a tmpfs and you can even implement a policy on
> > top of that (who can generate new ids, gurantee uniqness etc...).
> >
> > The above is certainly not for free of course but if you really need a
> > system wide consistency when using names then you need some sort of
> > central authority. How you implement that is not all that important
> > but I do not think we want to handle that in the kernel.
>
> IDK. If the whole thing could be put behind a CONFIG_ knob, with _zero_
> overhead when not enabled (and I'm a bit worried about all the functions
> that grow an extra argument that gets passed around), I don't mind the
> string interface. But I don't really have a say either way.
If this is ever useful outside of Android, eventually distros will
have it enabled.
Best regards,
Pavel
--
http://www.livejournal.com/~pavelmachek
On Thu, Oct 7, 2021 at 12:33 AM David Hildenbrand <[email protected]> wrote:
>
> On 06.10.21 17:20, Suren Baghdasaryan wrote:
> > On Wed, Oct 6, 2021 at 8:08 AM David Hildenbrand <[email protected]> wrote:
> >>
> >> On 06.10.21 17:01, Suren Baghdasaryan wrote:
> >>> On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
> >>>>
> >>>> On 06.10.21 10:27, Michal Hocko wrote:
> >>>>> On Tue 05-10-21 23:57:36, John Hubbard wrote:
> >>>>> [...]
> >>>>>> 1) Yes, just leave the strings in the kernel, that's simple and
> >>>>>> it works, and the alternatives don't really help your case nearly
> >>>>>> enough.
> >>>>>
> >>>>> I do not have a strong opinion. Strings are easier to use but they
> >>>>> are more involved and the necessity of kref approach just underlines
> >>>>> that. There are going to be new allocations and that always can lead
> >>>>> to surprising side effects. These are small (80B at maximum) so the
> >>>>> overall footpring shouldn't all that large by default but it can grow
> >>>>> quite large with a very high max_map_count. There are workloads which
> >>>>> really require the default to be set high (e.g. heavy mremap users). So
> >>>>> if anything all those should be __GFP_ACCOUNT and memcg accounted.
> >>>>>
> >>>>> I do agree that numbers are just much more simpler from accounting,
> >>>>> performance and implementation POV.
> >>>>
> >>>> +1
> >>>>
> >>>> I can understand that having a string can be quite beneficial e.g., when
> >>>> dumping mmaps. If only user space knows the id <-> string mapping, that
> >>>> can be quite tricky.
> >>>>
> >>>> However, I also do wonder if there would be a way to standardize/reserve
> >>>> ids, such that a given id always corresponds to a specific user. If we
> >>>> use an uint64_t for an id, there would be plenty room to reserve ids ...
> >>>>
> >>>> I'd really prefer if we can avoid using strings and instead using ids.
> >>>
> >>> I wish it was that simple and for some names like [anon:.bss] or
> >>> [anon:dalvik-zygote space] reserving a unique id would work, however
> >>> some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> >>> are generated dynamically at runtime and include package name.
> >>
> >> Valuable information
> >
> > Yeah, I should have described it clearer the first time around.
> >
> >>
> >>> Packages are constantly evolving, new ones are developed, names can
> >>> change, etc. So assigning a unique id for these names is not really
> >>> feasible.
> >>
> >> So, you'd actually want to generate/reserve an id for a given string at
> >> runtime, assign that id to the VMA, and have a way to match id <->
> >> string somehow?
> >
> > If we go with ids then yes, that is what we would have to do.
> >
> >> That reservation service could be inside the kernel or even (better?) in
> >> user space. The service could for example de-duplicates strings.
> >
> > Yes but it would require an IPC call to that service potentially on
> > every mmap() when we want to name a mapped vma. This would be
> > prohibitive. Even on consumption side, instead of just dumping
> > /proc/$pid/maps we would have to parse the file and convert all
> > [anon:id] into [anon:name] with each conversion requiring an IPC call
> > (assuming no id->name pair caching on the client side).
>
> mmap() and prctl() already do take the mmap sem in write, so they are
> not the "most lightweight" operations so to say.
>
> We already to have two separate operations, first the mmap(), then the
> prctl(). IMHO you could defer the "naming" part to a later point in
> time, without creating too many issues, moving it out of the
> "hot/performance critical phase"
>
> Reading https://lwn.net/Articles/867818/, to me it feels like the use
> case could live with a little larger delay between the mmap popping up
> and a name getting assigned.
That might be doable if occasional inconsistency can be tolerated (we
can't guarantee that maps won't be read before the deferred work name
the vma). However I would prefer an efficient solution vs the one
which is inefficient but can be deferred.
>
> --
> Thanks,
>
> David / dhildenb
>
On Thu, Oct 7, 2021 at 3:15 AM Pavel Machek <[email protected]> wrote:
>
> Hi!
>
> > >> Hmm, so the suggestion is to have some directory which contains files
> > >> representing IDs, each containing the string name of the associated
> > >> vma? Then let's say we are creating a new VMA and want to name it. We
> > >> would have to scan that directory, check all files and see if any of
> > >> them contain the name we want to reuse the same ID.
> > >
> > > I believe Pavel meant something as simple as
> > > $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > > $ touch $YOUR_FILE
> > > $ stat -c %i $YOUR_FILE
Ah, ok, now I understand the proposal. Thanks for the clarification!
So, this would use filesystem as a directory for inode->name mappings.
One rough edge for me is that the consumer would still need to parse
/proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
just dumping the content for the user. Would it be acceptable if we
require the ID provided by prctl() to always be a valid inode and
show_map_vma() would do the inode-to-filename conversion when
generating maps/smaps files? I know that inode->dentry is not
one-to-one mapping but we can simply output the first dentry name.
WDYT?
> >
> > So in terms of syscall overhead, that would be open(..., O_CREAT |
> > O_CLOEXEC), fstat(), close() - or one could optimistically start by
>
> You could get to two if you used mkdir instead of open.
>
> > > YOUR_IDS_DIR can live on a tmpfs and you can even implement a policy on
> > > top of that (who can generate new ids, gurantee uniqness etc...).
> > >
> > > The above is certainly not for free of course but if you really need a
> > > system wide consistency when using names then you need some sort of
> > > central authority. How you implement that is not all that important
> > > but I do not think we want to handle that in the kernel.
Ideally it would be great if $YOUR_IDS_DIR/my_string_name entries
could be generated by the kernel in response to userspace calling
prctl(..., name) but I haven't looked into complexity of doing that,
so I would not propose that at this point.
Thanks for sharing the ideas!
Suren.
> >
> > IDK. If the whole thing could be put behind a CONFIG_ knob, with _zero_
> > overhead when not enabled (and I'm a bit worried about all the functions
> > that grow an extra argument that gets passed around), I don't mind the
> > string interface. But I don't really have a say either way.
>
> If this is ever useful outside of Android, eventually distros will
> have it enabled.
>
> Best regards,
> Pavel
> --
> http://www.livejournal.com/~pavelmachek
On Thu, Oct 7, 2021 at 9:40 AM Michal Hocko <[email protected]> wrote:
>
> On Thu 07-10-21 09:04:09, Suren Baghdasaryan wrote:
> > On Thu, Oct 7, 2021 at 3:15 AM Pavel Machek <[email protected]> wrote:
> > >
> > > Hi!
> > >
> > > > >> Hmm, so the suggestion is to have some directory which contains files
> > > > >> representing IDs, each containing the string name of the associated
> > > > >> vma? Then let's say we are creating a new VMA and want to name it. We
> > > > >> would have to scan that directory, check all files and see if any of
> > > > >> them contain the name we want to reuse the same ID.
> > > > >
> > > > > I believe Pavel meant something as simple as
> > > > > $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > > > > $ touch $YOUR_FILE
> > > > > $ stat -c %i $YOUR_FILE
> >
> > Ah, ok, now I understand the proposal. Thanks for the clarification!
> > So, this would use filesystem as a directory for inode->name mappings.
> > One rough edge for me is that the consumer would still need to parse
> > /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
> > just dumping the content for the user. Would it be acceptable if we
> > require the ID provided by prctl() to always be a valid inode and
> > show_map_vma() would do the inode-to-filename conversion when
> > generating maps/smaps files? I know that inode->dentry is not
> > one-to-one mapping but we can simply output the first dentry name.
> > WDYT?
>
> No. You do not want to dictate any particular way of the mapping. The
> above is just one way to do that without developing any actual mapping
> yourself. You just use a filesystem for that. Kernel doesn't and
> shouldn't understand the meaning of those numbers. It has no business in
> that.
>
> In a way this would be pushing policy into the kernel.
I can see your point. Any other ideas on how to prevent tools from
doing this id-to-name conversion themselves?
Aside from the obvious inefficiency of requiring tools to parse
maps/smaps and convert ids to names it also creates a tool->system
dependency. Tools would have to know how the system interprets these
IDs and on the systems where the ID is an inode they would have to
know where the soflinks proposed by Rasmus reside and convert them. If
I'm a tool developer who wants the tool to work on multiple systems
this becomes quite messy.
>
> --
> Michal Hocko
> SUSE Labs
On Thu, Oct 7, 2021 at 10:31 AM Michal Hocko <[email protected]> wrote:
>
> On Thu 07-10-21 09:58:02, Suren Baghdasaryan wrote:
> > On Thu, Oct 7, 2021 at 9:40 AM Michal Hocko <[email protected]> wrote:
> > >
> > > On Thu 07-10-21 09:04:09, Suren Baghdasaryan wrote:
> > > > On Thu, Oct 7, 2021 at 3:15 AM Pavel Machek <[email protected]> wrote:
> > > > >
> > > > > Hi!
> > > > >
> > > > > > >> Hmm, so the suggestion is to have some directory which contains files
> > > > > > >> representing IDs, each containing the string name of the associated
> > > > > > >> vma? Then let's say we are creating a new VMA and want to name it. We
> > > > > > >> would have to scan that directory, check all files and see if any of
> > > > > > >> them contain the name we want to reuse the same ID.
> > > > > > >
> > > > > > > I believe Pavel meant something as simple as
> > > > > > > $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > > > > > > $ touch $YOUR_FILE
> > > > > > > $ stat -c %i $YOUR_FILE
> > > >
> > > > Ah, ok, now I understand the proposal. Thanks for the clarification!
> > > > So, this would use filesystem as a directory for inode->name mappings.
> > > > One rough edge for me is that the consumer would still need to parse
> > > > /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
> > > > just dumping the content for the user. Would it be acceptable if we
> > > > require the ID provided by prctl() to always be a valid inode and
> > > > show_map_vma() would do the inode-to-filename conversion when
> > > > generating maps/smaps files? I know that inode->dentry is not
> > > > one-to-one mapping but we can simply output the first dentry name.
> > > > WDYT?
> > >
> > > No. You do not want to dictate any particular way of the mapping. The
> > > above is just one way to do that without developing any actual mapping
> > > yourself. You just use a filesystem for that. Kernel doesn't and
> > > shouldn't understand the meaning of those numbers. It has no business in
> > > that.
> > >
> > > In a way this would be pushing policy into the kernel.
> >
> > I can see your point. Any other ideas on how to prevent tools from
> > doing this id-to-name conversion themselves?
>
> I really fail to understand why you really want to prevent them from that.
> Really, the whole thing is just a cookie that kernel maintains for memory
> mappings so that two parties can understand what the meaning of that
> mapping is from a higher level. They both have to agree on the naming
> but the kernel shouldn't dictate any specific convention because the
> kernel _doesn't_ _care_. These things are not really anything actionable
> for the kernel. It is just a metadata.
The desire is for one of these two parties to be a human who can get
the data and use it as is without additional conversions.
/proc/$pid/maps could report FD numbers instead of pathnames, which
could be converted to pathnames in userspace. However we do not do
that because pathnames are more convenient for humans to identify a
specific resource. Same logic applies here IMHO.
>
> --
> Michal Hocko
> SUSE Labs
On 10/7/21 11:50, Suren Baghdasaryan wrote:
...
>>>>>>>>>> I believe Pavel meant something as simple as
>>>>>>>>>> $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
>>>>>>>>>> $ touch $YOUR_FILE
>>>>>>>>>> $ stat -c %i $YOUR_FILE
>>>>>>>
>>>>>>> Ah, ok, now I understand the proposal. Thanks for the clarification!
>>>>>>> So, this would use filesystem as a directory for inode->name mappings.
>>>>>>> One rough edge for me is that the consumer would still need to parse
>>>>>>> /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
>>>>>>> just dumping the content for the user. Would it be acceptable if we
>>>>>>> require the ID provided by prctl() to always be a valid inode and
>>>>>>> show_map_vma() would do the inode-to-filename conversion when
>>>>>>> generating maps/smaps files? I know that inode->dentry is not
>>>>>>> one-to-one mapping but we can simply output the first dentry name.
>>>>>>> WDYT?
>>>>>>
>>>>>> No. You do not want to dictate any particular way of the mapping. The
>>>>>> above is just one way to do that without developing any actual mapping
>>>>>> yourself. You just use a filesystem for that. Kernel doesn't and
>>>>>> shouldn't understand the meaning of those numbers. It has no business in
>>>>>> that.
>>>>>>
>>>>>> In a way this would be pushing policy into the kernel.
>>>>>
>>>>> I can see your point. Any other ideas on how to prevent tools from
>>>>> doing this id-to-name conversion themselves?
>>>>
>>>> I really fail to understand why you really want to prevent them from that.
>>>> Really, the whole thing is just a cookie that kernel maintains for memory
>>>> mappings so that two parties can understand what the meaning of that
>>>> mapping is from a higher level. They both have to agree on the naming
>>>> but the kernel shouldn't dictate any specific convention because the
>>>> kernel _doesn't_ _care_. These things are not really anything actionable
>>>> for the kernel. It is just a metadata.
>>>
>>> The desire is for one of these two parties to be a human who can get
>>> the data and use it as is without additional conversions.
>>> /proc/$pid/maps could report FD numbers instead of pathnames, which
>>> could be converted to pathnames in userspace. However we do not do
>>> that because pathnames are more convenient for humans to identify a
>>> specific resource. Same logic applies here IMHO.
>>
>> Yes, please. It really seems like the folks that are interested in this
>> feature want strings. (I certainly do.) For those not interested in the
>> feature, it sounds like a CONFIG to keep it away would be sufficient.
>> Can we just move forward with that?
>
> Would love to if others are ok with this.
>
If this doesn't get accepted, then another way forward would to continue
the ideas above to their logical conclusion, and create a new file system:
vma-fs. Like debug-fs and other special file systems, similar policy and
motivation. Also protected by a CONFIG option.
Actually this seems at least as natural as the procfs approach, especially
given the nature of these strings, which feel more like dir+file names, than
simple strings.
thanks,
--
John Hubbard
NVIDIA
On Thu, Oct 7, 2021 at 12:59 AM Michal Hocko <[email protected]> wrote:
>
> On Wed 06-10-21 08:01:56, Suren Baghdasaryan wrote:
> > On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
> > >
> > > On 06.10.21 10:27, Michal Hocko wrote:
> > > > On Tue 05-10-21 23:57:36, John Hubbard wrote:
> > > > [...]
> > > >> 1) Yes, just leave the strings in the kernel, that's simple and
> > > >> it works, and the alternatives don't really help your case nearly
> > > >> enough.
> > > >
> > > > I do not have a strong opinion. Strings are easier to use but they
> > > > are more involved and the necessity of kref approach just underlines
> > > > that. There are going to be new allocations and that always can lead
> > > > to surprising side effects. These are small (80B at maximum) so the
> > > > overall footpring shouldn't all that large by default but it can grow
> > > > quite large with a very high max_map_count. There are workloads which
> > > > really require the default to be set high (e.g. heavy mremap users). So
> > > > if anything all those should be __GFP_ACCOUNT and memcg accounted.
> > > >
> > > > I do agree that numbers are just much more simpler from accounting,
> > > > performance and implementation POV.
> > >
> > > +1
> > >
> > > I can understand that having a string can be quite beneficial e.g., when
> > > dumping mmaps. If only user space knows the id <-> string mapping, that
> > > can be quite tricky.
> > >
> > > However, I also do wonder if there would be a way to standardize/reserve
> > > ids, such that a given id always corresponds to a specific user. If we
> > > use an uint64_t for an id, there would be plenty room to reserve ids ...
> > >
> > > I'd really prefer if we can avoid using strings and instead using ids.
> >
> > I wish it was that simple and for some names like [anon:.bss] or
> > [anon:dalvik-zygote space] reserving a unique id would work, however
> > some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> > are generated dynamically at runtime and include package name.
> > Packages are constantly evolving, new ones are developed, names can
> > change, etc. So assigning a unique id for these names is not really
> > feasible.
>
> I still do not follow. If you need a globaly consistent naming then
> you need clear rules for that, no matter whether that is number or a
> file. How do you handle this with strings currently?
Some names represent standard categories, some are unique. A simple
tool could calculate and report the total for each name, a more
advanced tool might recognize some standard names and process them
differently. From kernel's POV, it's just a name used by the userspace
to categorize anonymous memory areas.
>
> --
> Michal Hocko
> SUSE Labs
On Thu 07-10-21 09:04:09, Suren Baghdasaryan wrote:
> On Thu, Oct 7, 2021 at 3:15 AM Pavel Machek <[email protected]> wrote:
> >
> > Hi!
> >
> > > >> Hmm, so the suggestion is to have some directory which contains files
> > > >> representing IDs, each containing the string name of the associated
> > > >> vma? Then let's say we are creating a new VMA and want to name it. We
> > > >> would have to scan that directory, check all files and see if any of
> > > >> them contain the name we want to reuse the same ID.
> > > >
> > > > I believe Pavel meant something as simple as
> > > > $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > > > $ touch $YOUR_FILE
> > > > $ stat -c %i $YOUR_FILE
>
> Ah, ok, now I understand the proposal. Thanks for the clarification!
> So, this would use filesystem as a directory for inode->name mappings.
> One rough edge for me is that the consumer would still need to parse
> /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
> just dumping the content for the user. Would it be acceptable if we
> require the ID provided by prctl() to always be a valid inode and
> show_map_vma() would do the inode-to-filename conversion when
> generating maps/smaps files? I know that inode->dentry is not
> one-to-one mapping but we can simply output the first dentry name.
> WDYT?
No. You do not want to dictate any particular way of the mapping. The
above is just one way to do that without developing any actual mapping
yourself. You just use a filesystem for that. Kernel doesn't and
shouldn't understand the meaning of those numbers. It has no business in
that.
In a way this would be pushing policy into the kernel.
--
Michal Hocko
SUSE Labs
On Thu 07-10-21 08:45:21, Suren Baghdasaryan wrote:
> On Thu, Oct 7, 2021 at 12:59 AM Michal Hocko <[email protected]> wrote:
> >
> > On Wed 06-10-21 08:01:56, Suren Baghdasaryan wrote:
> > > On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
> > > >
> > > > On 06.10.21 10:27, Michal Hocko wrote:
> > > > > On Tue 05-10-21 23:57:36, John Hubbard wrote:
> > > > > [...]
> > > > >> 1) Yes, just leave the strings in the kernel, that's simple and
> > > > >> it works, and the alternatives don't really help your case nearly
> > > > >> enough.
> > > > >
> > > > > I do not have a strong opinion. Strings are easier to use but they
> > > > > are more involved and the necessity of kref approach just underlines
> > > > > that. There are going to be new allocations and that always can lead
> > > > > to surprising side effects. These are small (80B at maximum) so the
> > > > > overall footpring shouldn't all that large by default but it can grow
> > > > > quite large with a very high max_map_count. There are workloads which
> > > > > really require the default to be set high (e.g. heavy mremap users). So
> > > > > if anything all those should be __GFP_ACCOUNT and memcg accounted.
> > > > >
> > > > > I do agree that numbers are just much more simpler from accounting,
> > > > > performance and implementation POV.
> > > >
> > > > +1
> > > >
> > > > I can understand that having a string can be quite beneficial e.g., when
> > > > dumping mmaps. If only user space knows the id <-> string mapping, that
> > > > can be quite tricky.
> > > >
> > > > However, I also do wonder if there would be a way to standardize/reserve
> > > > ids, such that a given id always corresponds to a specific user. If we
> > > > use an uint64_t for an id, there would be plenty room to reserve ids ...
> > > >
> > > > I'd really prefer if we can avoid using strings and instead using ids.
> > >
> > > I wish it was that simple and for some names like [anon:.bss] or
> > > [anon:dalvik-zygote space] reserving a unique id would work, however
> > > some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> > > are generated dynamically at runtime and include package name.
> > > Packages are constantly evolving, new ones are developed, names can
> > > change, etc. So assigning a unique id for these names is not really
> > > feasible.
> >
> > I still do not follow. If you need a globaly consistent naming then
> > you need clear rules for that, no matter whether that is number or a
> > file. How do you handle this with strings currently?
>
> Some names represent standard categories, some are unique. A simple
> tool could calculate and report the total for each name, a more
> advanced tool might recognize some standard names and process them
> differently. From kernel's POV, it's just a name used by the userspace
> to categorize anonymous memory areas.
OK, so there is no real authority or any real naming convention. You
just hope that applications will behave so that the consumer of those
names can make proper calls. Correct?
In that case the same applies to numbers and I do not see any strong
argument for strings other than it is more pleasing to a human eye when
reading the file. And that doesn't sound like a strong argument to make
the kernel more complicated. Functionally both approaches are equal from
a practical POV.
--
Michal Hocko
SUSE Labs
On Thu 07-10-21 09:43:14, Suren Baghdasaryan wrote:
> On Thu, Oct 7, 2021 at 9:37 AM Michal Hocko <[email protected]> wrote:
[...]
> > OK, so there is no real authority or any real naming convention. You
> > just hope that applications will behave so that the consumer of those
> > names can make proper calls. Correct?
> >
> > In that case the same applies to numbers and I do not see any strong
> > argument for strings other than it is more pleasing to a human eye when
> > reading the file. And that doesn't sound like a strong argument to make
> > the kernel more complicated. Functionally both approaches are equal from
> > a practical POV.
>
> I don't think that's correct. Names like [anon:.bss],
> [anon:dalvik-zygote space] and
> [anon:dalvik-/system/framework/boot-core-icu4j.art] provide user with
> actionable information about the use of that memory or the allocator
> using it.
No, none of the above is really actionable without a common
understanding. Both dalvik* are a complete gibberish to me.
--
Michal Hocko
SUSE Labs
On Thu, Oct 7, 2021 at 10:25 AM 'Michal Hocko' via kernel-team
<[email protected]> wrote:
>
> On Thu 07-10-21 09:43:14, Suren Baghdasaryan wrote:
> > On Thu, Oct 7, 2021 at 9:37 AM Michal Hocko <[email protected]> wrote:
> [...]
> > > OK, so there is no real authority or any real naming convention. You
> > > just hope that applications will behave so that the consumer of those
> > > names can make proper calls. Correct?
> > >
> > > In that case the same applies to numbers and I do not see any strong
> > > argument for strings other than it is more pleasing to a human eye when
> > > reading the file. And that doesn't sound like a strong argument to make
> > > the kernel more complicated. Functionally both approaches are equal from
> > > a practical POV.
> >
> > I don't think that's correct. Names like [anon:.bss],
> > [anon:dalvik-zygote space] and
> > [anon:dalvik-/system/framework/boot-core-icu4j.art] provide user with
> > actionable information about the use of that memory or the allocator
> > using it.
>
> No, none of the above is really actionable without a common
> understanding. Both dalvik* are a complete gibberish to me.
Ok, maybe I was unclear. Some names, as the first two in the above
example are quite standard for Android and tools do use them to
identify specific specialized areas. Some names are not standardized
and can contain package names, like
anon:dalvik-/system/framework/boot-core-icu4j.art. In this case while
tools do not process them in any special way, they still convey enough
information for a user to identify the corresponding component.
> --
> Michal Hocko
> SUSE Labs
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
>
On Thu 07-10-21 09:58:02, Suren Baghdasaryan wrote:
> On Thu, Oct 7, 2021 at 9:40 AM Michal Hocko <[email protected]> wrote:
> >
> > On Thu 07-10-21 09:04:09, Suren Baghdasaryan wrote:
> > > On Thu, Oct 7, 2021 at 3:15 AM Pavel Machek <[email protected]> wrote:
> > > >
> > > > Hi!
> > > >
> > > > > >> Hmm, so the suggestion is to have some directory which contains files
> > > > > >> representing IDs, each containing the string name of the associated
> > > > > >> vma? Then let's say we are creating a new VMA and want to name it. We
> > > > > >> would have to scan that directory, check all files and see if any of
> > > > > >> them contain the name we want to reuse the same ID.
> > > > > >
> > > > > > I believe Pavel meant something as simple as
> > > > > > $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > > > > > $ touch $YOUR_FILE
> > > > > > $ stat -c %i $YOUR_FILE
> > >
> > > Ah, ok, now I understand the proposal. Thanks for the clarification!
> > > So, this would use filesystem as a directory for inode->name mappings.
> > > One rough edge for me is that the consumer would still need to parse
> > > /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
> > > just dumping the content for the user. Would it be acceptable if we
> > > require the ID provided by prctl() to always be a valid inode and
> > > show_map_vma() would do the inode-to-filename conversion when
> > > generating maps/smaps files? I know that inode->dentry is not
> > > one-to-one mapping but we can simply output the first dentry name.
> > > WDYT?
> >
> > No. You do not want to dictate any particular way of the mapping. The
> > above is just one way to do that without developing any actual mapping
> > yourself. You just use a filesystem for that. Kernel doesn't and
> > shouldn't understand the meaning of those numbers. It has no business in
> > that.
> >
> > In a way this would be pushing policy into the kernel.
>
> I can see your point. Any other ideas on how to prevent tools from
> doing this id-to-name conversion themselves?
I really fail to understand why you really want to prevent them from that.
Really, the whole thing is just a cookie that kernel maintains for memory
mappings so that two parties can understand what the meaning of that
mapping is from a higher level. They both have to agree on the naming
but the kernel shouldn't dictate any specific convention because the
kernel _doesn't_ _care_. These things are not really anything actionable
for the kernel. It is just a metadata.
--
Michal Hocko
SUSE Labs
On Thu, Oct 07, 2021 at 10:50:24AM -0700, Suren Baghdasaryan wrote:
> On Thu, Oct 7, 2021 at 10:31 AM Michal Hocko <[email protected]> wrote:
> >
> > On Thu 07-10-21 09:58:02, Suren Baghdasaryan wrote:
> > > On Thu, Oct 7, 2021 at 9:40 AM Michal Hocko <[email protected]> wrote:
> > > >
> > > > On Thu 07-10-21 09:04:09, Suren Baghdasaryan wrote:
> > > > > On Thu, Oct 7, 2021 at 3:15 AM Pavel Machek <[email protected]> wrote:
> > > > > >
> > > > > > Hi!
> > > > > >
> > > > > > > >> Hmm, so the suggestion is to have some directory which contains files
> > > > > > > >> representing IDs, each containing the string name of the associated
> > > > > > > >> vma? Then let's say we are creating a new VMA and want to name it. We
> > > > > > > >> would have to scan that directory, check all files and see if any of
> > > > > > > >> them contain the name we want to reuse the same ID.
> > > > > > > >
> > > > > > > > I believe Pavel meant something as simple as
> > > > > > > > $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > > > > > > > $ touch $YOUR_FILE
> > > > > > > > $ stat -c %i $YOUR_FILE
> > > > >
> > > > > Ah, ok, now I understand the proposal. Thanks for the clarification!
> > > > > So, this would use filesystem as a directory for inode->name mappings.
> > > > > One rough edge for me is that the consumer would still need to parse
> > > > > /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
> > > > > just dumping the content for the user. Would it be acceptable if we
> > > > > require the ID provided by prctl() to always be a valid inode and
> > > > > show_map_vma() would do the inode-to-filename conversion when
> > > > > generating maps/smaps files? I know that inode->dentry is not
> > > > > one-to-one mapping but we can simply output the first dentry name.
> > > > > WDYT?
> > > >
> > > > No. You do not want to dictate any particular way of the mapping. The
> > > > above is just one way to do that without developing any actual mapping
> > > > yourself. You just use a filesystem for that. Kernel doesn't and
> > > > shouldn't understand the meaning of those numbers. It has no business in
> > > > that.
> > > >
> > > > In a way this would be pushing policy into the kernel.
> > >
> > > I can see your point. Any other ideas on how to prevent tools from
> > > doing this id-to-name conversion themselves?
> >
> > I really fail to understand why you really want to prevent them from that.
> > Really, the whole thing is just a cookie that kernel maintains for memory
> > mappings so that two parties can understand what the meaning of that
> > mapping is from a higher level. They both have to agree on the naming
> > but the kernel shouldn't dictate any specific convention because the
> > kernel _doesn't_ _care_. These things are not really anything actionable
> > for the kernel. It is just a metadata.
>
> The desire is for one of these two parties to be a human who can get
> the data and use it as is without additional conversions.
> /proc/$pid/maps could report FD numbers instead of pathnames, which
> could be converted to pathnames in userspace. However we do not do
> that because pathnames are more convenient for humans to identify a
> specific resource. Same logic applies here IMHO.
Yes, please. It really seems like the folks that are interested in this
feature want strings. (I certainly do.) For those not interested in the
feature, it sounds like a CONFIG to keep it away would be sufficient.
Can we just move forward with that?
--
Kees Cook
On Thu, Oct 7, 2021 at 11:13 AM Kees Cook <[email protected]> wrote:
>
> On Thu, Oct 07, 2021 at 10:50:24AM -0700, Suren Baghdasaryan wrote:
> > On Thu, Oct 7, 2021 at 10:31 AM Michal Hocko <[email protected]> wrote:
> > >
> > > On Thu 07-10-21 09:58:02, Suren Baghdasaryan wrote:
> > > > On Thu, Oct 7, 2021 at 9:40 AM Michal Hocko <[email protected]> wrote:
> > > > >
> > > > > On Thu 07-10-21 09:04:09, Suren Baghdasaryan wrote:
> > > > > > On Thu, Oct 7, 2021 at 3:15 AM Pavel Machek <[email protected]> wrote:
> > > > > > >
> > > > > > > Hi!
> > > > > > >
> > > > > > > > >> Hmm, so the suggestion is to have some directory which contains files
> > > > > > > > >> representing IDs, each containing the string name of the associated
> > > > > > > > >> vma? Then let's say we are creating a new VMA and want to name it. We
> > > > > > > > >> would have to scan that directory, check all files and see if any of
> > > > > > > > >> them contain the name we want to reuse the same ID.
> > > > > > > > >
> > > > > > > > > I believe Pavel meant something as simple as
> > > > > > > > > $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > > > > > > > > $ touch $YOUR_FILE
> > > > > > > > > $ stat -c %i $YOUR_FILE
> > > > > >
> > > > > > Ah, ok, now I understand the proposal. Thanks for the clarification!
> > > > > > So, this would use filesystem as a directory for inode->name mappings.
> > > > > > One rough edge for me is that the consumer would still need to parse
> > > > > > /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
> > > > > > just dumping the content for the user. Would it be acceptable if we
> > > > > > require the ID provided by prctl() to always be a valid inode and
> > > > > > show_map_vma() would do the inode-to-filename conversion when
> > > > > > generating maps/smaps files? I know that inode->dentry is not
> > > > > > one-to-one mapping but we can simply output the first dentry name.
> > > > > > WDYT?
> > > > >
> > > > > No. You do not want to dictate any particular way of the mapping. The
> > > > > above is just one way to do that without developing any actual mapping
> > > > > yourself. You just use a filesystem for that. Kernel doesn't and
> > > > > shouldn't understand the meaning of those numbers. It has no business in
> > > > > that.
> > > > >
> > > > > In a way this would be pushing policy into the kernel.
> > > >
> > > > I can see your point. Any other ideas on how to prevent tools from
> > > > doing this id-to-name conversion themselves?
> > >
> > > I really fail to understand why you really want to prevent them from that.
> > > Really, the whole thing is just a cookie that kernel maintains for memory
> > > mappings so that two parties can understand what the meaning of that
> > > mapping is from a higher level. They both have to agree on the naming
> > > but the kernel shouldn't dictate any specific convention because the
> > > kernel _doesn't_ _care_. These things are not really anything actionable
> > > for the kernel. It is just a metadata.
> >
> > The desire is for one of these two parties to be a human who can get
> > the data and use it as is without additional conversions.
> > /proc/$pid/maps could report FD numbers instead of pathnames, which
> > could be converted to pathnames in userspace. However we do not do
> > that because pathnames are more convenient for humans to identify a
> > specific resource. Same logic applies here IMHO.
>
> Yes, please. It really seems like the folks that are interested in this
> feature want strings. (I certainly do.) For those not interested in the
> feature, it sounds like a CONFIG to keep it away would be sufficient.
> Can we just move forward with that?
Would love to if others are ok with this.
>
> --
> Kees Cook
On Thu, Oct 7, 2021 at 9:37 AM Michal Hocko <[email protected]> wrote:
>
> On Thu 07-10-21 08:45:21, Suren Baghdasaryan wrote:
> > On Thu, Oct 7, 2021 at 12:59 AM Michal Hocko <[email protected]> wrote:
> > >
> > > On Wed 06-10-21 08:01:56, Suren Baghdasaryan wrote:
> > > > On Wed, Oct 6, 2021 at 2:27 AM David Hildenbrand <[email protected]> wrote:
> > > > >
> > > > > On 06.10.21 10:27, Michal Hocko wrote:
> > > > > > On Tue 05-10-21 23:57:36, John Hubbard wrote:
> > > > > > [...]
> > > > > >> 1) Yes, just leave the strings in the kernel, that's simple and
> > > > > >> it works, and the alternatives don't really help your case nearly
> > > > > >> enough.
> > > > > >
> > > > > > I do not have a strong opinion. Strings are easier to use but they
> > > > > > are more involved and the necessity of kref approach just underlines
> > > > > > that. There are going to be new allocations and that always can lead
> > > > > > to surprising side effects. These are small (80B at maximum) so the
> > > > > > overall footpring shouldn't all that large by default but it can grow
> > > > > > quite large with a very high max_map_count. There are workloads which
> > > > > > really require the default to be set high (e.g. heavy mremap users). So
> > > > > > if anything all those should be __GFP_ACCOUNT and memcg accounted.
> > > > > >
> > > > > > I do agree that numbers are just much more simpler from accounting,
> > > > > > performance and implementation POV.
> > > > >
> > > > > +1
> > > > >
> > > > > I can understand that having a string can be quite beneficial e.g., when
> > > > > dumping mmaps. If only user space knows the id <-> string mapping, that
> > > > > can be quite tricky.
> > > > >
> > > > > However, I also do wonder if there would be a way to standardize/reserve
> > > > > ids, such that a given id always corresponds to a specific user. If we
> > > > > use an uint64_t for an id, there would be plenty room to reserve ids ...
> > > > >
> > > > > I'd really prefer if we can avoid using strings and instead using ids.
> > > >
> > > > I wish it was that simple and for some names like [anon:.bss] or
> > > > [anon:dalvik-zygote space] reserving a unique id would work, however
> > > > some names like [anon:dalvik-/system/framework/boot-core-icu4j.art]
> > > > are generated dynamically at runtime and include package name.
> > > > Packages are constantly evolving, new ones are developed, names can
> > > > change, etc. So assigning a unique id for these names is not really
> > > > feasible.
> > >
> > > I still do not follow. If you need a globaly consistent naming then
> > > you need clear rules for that, no matter whether that is number or a
> > > file. How do you handle this with strings currently?
> >
> > Some names represent standard categories, some are unique. A simple
> > tool could calculate and report the total for each name, a more
> > advanced tool might recognize some standard names and process them
> > differently. From kernel's POV, it's just a name used by the userspace
> > to categorize anonymous memory areas.
>
> OK, so there is no real authority or any real naming convention. You
> just hope that applications will behave so that the consumer of those
> names can make proper calls. Correct?
>
> In that case the same applies to numbers and I do not see any strong
> argument for strings other than it is more pleasing to a human eye when
> reading the file. And that doesn't sound like a strong argument to make
> the kernel more complicated. Functionally both approaches are equal from
> a practical POV.
I don't think that's correct. Names like [anon:.bss],
[anon:dalvik-zygote space] and
[anon:dalvik-/system/framework/boot-core-icu4j.art] provide user with
actionable information about the use of that memory or the allocator
using it. Names like [anon:1], [anon:2] and [anon:3] do not convey any
valuable information for the user until they are converted into
descriptive names.
> --
> Michal Hocko
> SUSE Labs
On Thu, Oct 7, 2021 at 12:03 PM 'John Hubbard' via kernel-team
<[email protected]> wrote:
>
> On 10/7/21 11:50, Suren Baghdasaryan wrote:
> ...
> >>>>>>>>>> I believe Pavel meant something as simple as
> >>>>>>>>>> $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> >>>>>>>>>> $ touch $YOUR_FILE
> >>>>>>>>>> $ stat -c %i $YOUR_FILE
> >>>>>>>
> >>>>>>> Ah, ok, now I understand the proposal. Thanks for the clarification!
> >>>>>>> So, this would use filesystem as a directory for inode->name mappings.
> >>>>>>> One rough edge for me is that the consumer would still need to parse
> >>>>>>> /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
> >>>>>>> just dumping the content for the user. Would it be acceptable if we
> >>>>>>> require the ID provided by prctl() to always be a valid inode and
> >>>>>>> show_map_vma() would do the inode-to-filename conversion when
> >>>>>>> generating maps/smaps files? I know that inode->dentry is not
> >>>>>>> one-to-one mapping but we can simply output the first dentry name.
> >>>>>>> WDYT?
> >>>>>>
> >>>>>> No. You do not want to dictate any particular way of the mapping. The
> >>>>>> above is just one way to do that without developing any actual mapping
> >>>>>> yourself. You just use a filesystem for that. Kernel doesn't and
> >>>>>> shouldn't understand the meaning of those numbers. It has no business in
> >>>>>> that.
> >>>>>>
> >>>>>> In a way this would be pushing policy into the kernel.
> >>>>>
> >>>>> I can see your point. Any other ideas on how to prevent tools from
> >>>>> doing this id-to-name conversion themselves?
> >>>>
> >>>> I really fail to understand why you really want to prevent them from that.
> >>>> Really, the whole thing is just a cookie that kernel maintains for memory
> >>>> mappings so that two parties can understand what the meaning of that
> >>>> mapping is from a higher level. They both have to agree on the naming
> >>>> but the kernel shouldn't dictate any specific convention because the
> >>>> kernel _doesn't_ _care_. These things are not really anything actionable
> >>>> for the kernel. It is just a metadata.
> >>>
> >>> The desire is for one of these two parties to be a human who can get
> >>> the data and use it as is without additional conversions.
> >>> /proc/$pid/maps could report FD numbers instead of pathnames, which
> >>> could be converted to pathnames in userspace. However we do not do
> >>> that because pathnames are more convenient for humans to identify a
> >>> specific resource. Same logic applies here IMHO.
> >>
> >> Yes, please. It really seems like the folks that are interested in this
> >> feature want strings. (I certainly do.) For those not interested in the
> >> feature, it sounds like a CONFIG to keep it away would be sufficient.
> >> Can we just move forward with that?
> >
> > Would love to if others are ok with this.
> >
>
> If this doesn't get accepted, then another way forward would to continue
> the ideas above to their logical conclusion, and create a new file system:
> vma-fs. Like debug-fs and other special file systems, similar policy and
> motivation. Also protected by a CONFIG option.
TBH, I would prefer to have the current simple solution protected with
a CONFIG option.
>
> Actually this seems at least as natural as the procfs approach, especially
> given the nature of these strings, which feel more like dir+file names, than
> simple strings.
>
> thanks,
> --
> John Hubbard
> NVIDIA
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
>
* Suren Baghdasaryan <[email protected]> [211007 17:32]:
> On Thu, Oct 7, 2021 at 12:03 PM 'John Hubbard' via kernel-team
> <[email protected]> wrote:
> >
> > On 10/7/21 11:50, Suren Baghdasaryan wrote:
> > ...
> > >>>>>>>>>> I believe Pavel meant something as simple as
> > >>>>>>>>>> $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > >>>>>>>>>> $ touch $YOUR_FILE
> > >>>>>>>>>> $ stat -c %i $YOUR_FILE
> > >>>>>>>
> > >>>>>>> Ah, ok, now I understand the proposal. Thanks for the clarification!
> > >>>>>>> So, this would use filesystem as a directory for inode->name mappings.
> > >>>>>>> One rough edge for me is that the consumer would still need to parse
> > >>>>>>> /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
> > >>>>>>> just dumping the content for the user. Would it be acceptable if we
> > >>>>>>> require the ID provided by prctl() to always be a valid inode and
> > >>>>>>> show_map_vma() would do the inode-to-filename conversion when
> > >>>>>>> generating maps/smaps files? I know that inode->dentry is not
> > >>>>>>> one-to-one mapping but we can simply output the first dentry name.
> > >>>>>>> WDYT?
> > >>>>>>
> > >>>>>> No. You do not want to dictate any particular way of the mapping. The
> > >>>>>> above is just one way to do that without developing any actual mapping
> > >>>>>> yourself. You just use a filesystem for that. Kernel doesn't and
> > >>>>>> shouldn't understand the meaning of those numbers. It has no business in
> > >>>>>> that.
> > >>>>>>
> > >>>>>> In a way this would be pushing policy into the kernel.
> > >>>>>
> > >>>>> I can see your point. Any other ideas on how to prevent tools from
> > >>>>> doing this id-to-name conversion themselves?
> > >>>>
> > >>>> I really fail to understand why you really want to prevent them from that.
> > >>>> Really, the whole thing is just a cookie that kernel maintains for memory
> > >>>> mappings so that two parties can understand what the meaning of that
> > >>>> mapping is from a higher level. They both have to agree on the naming
> > >>>> but the kernel shouldn't dictate any specific convention because the
> > >>>> kernel _doesn't_ _care_. These things are not really anything actionable
> > >>>> for the kernel. It is just a metadata.
> > >>>
> > >>> The desire is for one of these two parties to be a human who can get
> > >>> the data and use it as is without additional conversions.
> > >>> /proc/$pid/maps could report FD numbers instead of pathnames, which
> > >>> could be converted to pathnames in userspace. However we do not do
> > >>> that because pathnames are more convenient for humans to identify a
> > >>> specific resource. Same logic applies here IMHO.
> > >>
> > >> Yes, please. It really seems like the folks that are interested in this
> > >> feature want strings. (I certainly do.) For those not interested in the
> > >> feature, it sounds like a CONFIG to keep it away would be sufficient.
> > >> Can we just move forward with that?
> > >
> > > Would love to if others are ok with this.
> > >
> >
> > If this doesn't get accepted, then another way forward would to continue
> > the ideas above to their logical conclusion, and create a new file system:
> > vma-fs. Like debug-fs and other special file systems, similar policy and
> > motivation. Also protected by a CONFIG option.
>
> TBH, I would prefer to have the current simple solution protected with
> a CONFIG option.
>
> >
> > Actually this seems at least as natural as the procfs approach, especially
> > given the nature of these strings, which feel more like dir+file names, than
> > simple strings.
I think the current locking around VMAs makes this a very tricky, if not
impossible, path. Watching a proc file which takes the mmap_lock() is
painful enough. Considering how hard it has been to have this feature
added, I cannot see locking changes being accepted as a more feasible
approach nor can I see increased mmap_lock() contention from any feature
being desired.
I like the CONFIG option. The patches are in good shape and have a
clever way around the (unlikely?) scalability issue that existed.
Thanks,
Liam
On Thu 07-10-21 11:12:58, Kees Cook wrote:
> On Thu, Oct 07, 2021 at 10:50:24AM -0700, Suren Baghdasaryan wrote:
> > On Thu, Oct 7, 2021 at 10:31 AM Michal Hocko <[email protected]> wrote:
> > >
> > > On Thu 07-10-21 09:58:02, Suren Baghdasaryan wrote:
> > > > On Thu, Oct 7, 2021 at 9:40 AM Michal Hocko <[email protected]> wrote:
> > > > >
> > > > > On Thu 07-10-21 09:04:09, Suren Baghdasaryan wrote:
> > > > > > On Thu, Oct 7, 2021 at 3:15 AM Pavel Machek <[email protected]> wrote:
> > > > > > >
> > > > > > > Hi!
> > > > > > >
> > > > > > > > >> Hmm, so the suggestion is to have some directory which contains files
> > > > > > > > >> representing IDs, each containing the string name of the associated
> > > > > > > > >> vma? Then let's say we are creating a new VMA and want to name it. We
> > > > > > > > >> would have to scan that directory, check all files and see if any of
> > > > > > > > >> them contain the name we want to reuse the same ID.
> > > > > > > > >
> > > > > > > > > I believe Pavel meant something as simple as
> > > > > > > > > $ YOUR_FILE=$YOUR_IDS_DIR/my_string_name
> > > > > > > > > $ touch $YOUR_FILE
> > > > > > > > > $ stat -c %i $YOUR_FILE
> > > > > >
> > > > > > Ah, ok, now I understand the proposal. Thanks for the clarification!
> > > > > > So, this would use filesystem as a directory for inode->name mappings.
> > > > > > One rough edge for me is that the consumer would still need to parse
> > > > > > /proc/$pid/maps and convert [anon:inode] into [anon:name] instead of
> > > > > > just dumping the content for the user. Would it be acceptable if we
> > > > > > require the ID provided by prctl() to always be a valid inode and
> > > > > > show_map_vma() would do the inode-to-filename conversion when
> > > > > > generating maps/smaps files? I know that inode->dentry is not
> > > > > > one-to-one mapping but we can simply output the first dentry name.
> > > > > > WDYT?
> > > > >
> > > > > No. You do not want to dictate any particular way of the mapping. The
> > > > > above is just one way to do that without developing any actual mapping
> > > > > yourself. You just use a filesystem for that. Kernel doesn't and
> > > > > shouldn't understand the meaning of those numbers. It has no business in
> > > > > that.
> > > > >
> > > > > In a way this would be pushing policy into the kernel.
> > > >
> > > > I can see your point. Any other ideas on how to prevent tools from
> > > > doing this id-to-name conversion themselves?
> > >
> > > I really fail to understand why you really want to prevent them from that.
> > > Really, the whole thing is just a cookie that kernel maintains for memory
> > > mappings so that two parties can understand what the meaning of that
> > > mapping is from a higher level. They both have to agree on the naming
> > > but the kernel shouldn't dictate any specific convention because the
> > > kernel _doesn't_ _care_. These things are not really anything actionable
> > > for the kernel. It is just a metadata.
> >
> > The desire is for one of these two parties to be a human who can get
> > the data and use it as is without additional conversions.
> > /proc/$pid/maps could report FD numbers instead of pathnames, which
> > could be converted to pathnames in userspace. However we do not do
> > that because pathnames are more convenient for humans to identify a
> > specific resource. Same logic applies here IMHO.
>
> Yes, please. It really seems like the folks that are interested in this
> feature want strings. (I certainly do.)
I am sorry but there were no strong arguments mentioned for strings so
far. Effectively string require a more complex and more resource hungry
solution. The only advantage is that strings are nicer to read for
humans.
There hasn't been any plan presented for actual naming convention or how
those names would be used in practice. Except for a more advanced
resource management and that sounds like something that can work with
ids just fine.
> For those not interested in the
> feature, it sounds like a CONFIG to keep it away would be sufficient.
CONFIG is not an answer here as already pointed out. Distro kernels will
be forced to enable this because there might be somebody to use this
feature.
Initially I was not really feeling strongly one way or other but more we
are discussing the topic the more I see that strings have a very weak
justification behind.
--
Michal Hocko
SUSE Labs
On 07/10/2021 21.02, John Hubbard wrote:
> On 10/7/21 11:50, Suren Baghdasaryan wrote:
> ...
>>>> The desire is for one of these two parties to be a human who can get
>>>> the data and use it as is without additional conversions.
>>>> /proc/$pid/maps could report FD numbers instead of pathnames, which
>>>> could be converted to pathnames in userspace. However we do not do
>>>> that because pathnames are more convenient for humans to identify a
>>>> specific resource. Same logic applies here IMHO.
>>>
>>> Yes, please. It really seems like the folks that are interested in this
>>> feature want strings. (I certainly do.) For those not interested in the
>>> feature, it sounds like a CONFIG to keep it away would be sufficient.
>>> Can we just move forward with that?
>>
>> Would love to if others are ok with this.
>>
>
> If this doesn't get accepted, then another way forward would to continue
> the ideas above to their logical conclusion, and create a new file system:
> vma-fs.
Or: Why can't the library/application that wants a VMA backed by memory
to have some associated name not just
fd = open("/run/named-vmas/foobar#24", O_CLOEXEC|O_RDWR|O_EXCL|O_CREAT);
unlink("/run/named-vmas/foobar#24");
ftruncate(fd, ...);
mmap(fd);
where /run/named-vmas is a tmpfs (probably with some per-user/per-app
subdirs). That requires no changes in the kernel at all. Yes, it lacks
the automatic cleanup of using real anon mmap in case there's a crash
between open and unlink, but in an environment like Android that should
be solvable.
Rasmus
On 08.10.21 09:25, Rasmus Villemoes wrote:
> On 07/10/2021 21.02, John Hubbard wrote:
>> On 10/7/21 11:50, Suren Baghdasaryan wrote:
>> ...
>
>>>>> The desire is for one of these two parties to be a human who can get
>>>>> the data and use it as is without additional conversions.
>>>>> /proc/$pid/maps could report FD numbers instead of pathnames, which
>>>>> could be converted to pathnames in userspace. However we do not do
>>>>> that because pathnames are more convenient for humans to identify a
>>>>> specific resource. Same logic applies here IMHO.
>>>>
>>>> Yes, please. It really seems like the folks that are interested in this
>>>> feature want strings. (I certainly do.) For those not interested in the
>>>> feature, it sounds like a CONFIG to keep it away would be sufficient.
>>>> Can we just move forward with that?
>>>
>>> Would love to if others are ok with this.
>>>
>>
>> If this doesn't get accepted, then another way forward would to continue
>> the ideas above to their logical conclusion, and create a new file system:
>> vma-fs.
>
> Or: Why can't the library/application that wants a VMA backed by memory
> to have some associated name not just
>
> fd = open("/run/named-vmas/foobar#24", O_CLOEXEC|O_RDWR|O_EXCL|O_CREAT);
> unlink("/run/named-vmas/foobar#24");
> ftruncate(fd, ...);
> mmap(fd);
>
> where /run/named-vmas is a tmpfs (probably with some per-user/per-app
> subdirs). That requires no changes in the kernel at all. Yes, it lacks
> the automatic cleanup of using real anon mmap in case there's a crash
> between open and unlink, but in an environment like Android that should
> be solvable.
I'm going to point out that we already do have names for memfds.
"The name supplied in name is used as a filename and will be displayed
as the target of the corresponding symbolic link in the
directory /proc/self/fd/." It's also displayed in /proc/self/maps.
So theoretically, without any kernel changes one might be able to
achieve something as proposed in this patch via memfd_create().
There is one issue to be fixed:
MAP_PRIVATE on memfd results in a double memory consumption on any
access via the mapping last time I checked (one page for pagecache, one
page for private mapping).
--
Thanks,
David / dhildenb
On 10/7/21 11:34 PM, Michal Hocko wrote:
>> Yes, please. It really seems like the folks that are interested in this
>> feature want strings. (I certainly do.)
> I am sorry but there were no strong arguments mentioned for strings so
> far.
The folks who want this have maintained an out-of-tree patch using
strings. They've maintained it for the better part of a decade. I
don't know how widely this shipped in the Android ecosystem, but I
suspect we're talking about billions of devices. Right?
This is a feature that, if accepted into mainline, will get enabled and
used on billions of devices. If we dumb this down to integers, it's not
100% clear that it _will_ get used.
That's a pretty strong argument in my book, even if the contributors
have difficulty articulating exactly why they want strings.
On Fri 08-10-21 07:14:58, Dave Hansen wrote:
> On 10/7/21 11:34 PM, Michal Hocko wrote:
> >> Yes, please. It really seems like the folks that are interested in this
> >> feature want strings. (I certainly do.)
> > I am sorry but there were no strong arguments mentioned for strings so
> > far.
>
> The folks who want this have maintained an out-of-tree patch using
> strings. They've maintained it for the better part of a decade. I
> don't know how widely this shipped in the Android ecosystem, but I
> suspect we're talking about billions of devices. Right?
>
> This is a feature that, if accepted into mainline, will get enabled and
> used on billions of devices. If we dumb this down to integers, it's not
> 100% clear that it _will_ get used.
>
> That's a pretty strong argument in my book, even if the contributors
> have difficulty articulating exactly why they want strings.
I would agree that if integers would make this unusable then this would
be a strong argument. But I haven't really heard any arguments like that
so far. I have heard about IPC overhead and other speculations that do
not seem really convincing. We shouldn't hand wave concerns regarding
the implementation complexity and resource handling just by "somebody
has been using this for decates", right?
Do not get me wrong. This is going to become a user interface and we
will have to maintain it for ever. As such an extra scrutiny has to be
applied.
--
Michal Hocko
SUSE Labs
On Fri, Oct 8, 2021 at 7:57 AM Michal Hocko <[email protected]> wrote:
>
> On Fri 08-10-21 07:14:58, Dave Hansen wrote:
> > On 10/7/21 11:34 PM, Michal Hocko wrote:
> > >> Yes, please. It really seems like the folks that are interested in this
> > >> feature want strings. (I certainly do.)
> > > I am sorry but there were no strong arguments mentioned for strings so
> > > far.
> >
> > The folks who want this have maintained an out-of-tree patch using
> > strings. They've maintained it for the better part of a decade. I
> > don't know how widely this shipped in the Android ecosystem, but I
> > suspect we're talking about billions of devices. Right?
Correct.
> >
> > This is a feature that, if accepted into mainline, will get enabled and
> > used on billions of devices. If we dumb this down to integers, it's not
> > 100% clear that it _will_ get used.
Not as is and not with some major changes in the userspace, which
relied on a simple interface: set a name to a vma, observe that name
in the /proc/$pid/maps.
> >
> > That's a pretty strong argument in my book, even if the contributors
> > have difficulty articulating exactly why they want strings.
>
> I would agree that if integers would make this unusable then this would
> be a strong argument. But I haven't really heard any arguments like that
> so far. I have heard about IPC overhead and other speculations that do
> not seem really convincing. We shouldn't hand wave concerns regarding
> the implementation complexity and resource handling just by "somebody
> has been using this for decates", right?
>
> Do not get me wrong. This is going to become a user interface and we
> will have to maintain it for ever. As such an extra scrutiny has to be
> applied.
I don't know how to better articulate this. IPC transactions on
Android cannot be scheduled efficiently. We're going to have to stall
after mmap, make binder transaction, schedule a new process, get the
ID, make binder reply, schedule back to the original thread, resume.
Doing this potentially for every mmap is a non-starter. Deferring this
job is possible but we still have to do all this work, so it still
requires cpu cycles and power, not mentioning the additional
complexity in the userspace. I'm adding a rep from the performance
team, maybe Tim can explain this better.
There were a couple suggestions on using filesystem/memfd for naming
purposes which I need to explore but if that works the approach will
likely not involve any IDs. We want human-readable names in the maps
file, not a number.
Thanks for all the feedback and ideas!
> --
> Michal Hocko
> SUSE Labs
On Fri, Oct 08, 2021 at 08:34:47AM +0200, Michal Hocko wrote:
> I am sorry but there were no strong arguments mentioned for strings so
> far. Effectively string require a more complex and more resource hungry
> solution. The only advantage is that strings are nicer to read for
> humans.
How I see it:
- Strings are already present in the "maps" output, so this doesn't
create a burden on userspace to grow new parsers.
- Strings for "anon" specifically have no required format (this is good)
it's informational like the task_struct::comm and can (roughly)
anything. There's no naming convention for memfds, AF_UNIX, etc. Why
is one needed here? That seems like a completely unreasonable
requirement.
- Strings need to be in kernel space because cross-process GUP has been a
constant source of security flaws.
> There hasn't been any plan presented for actual naming convention or how
> those names would be used in practice. Except for a more advanced
> resource management and that sounds like something that can work with
> ids just fine.
There doesn't need to be and there shouldn't be. Why aren't memfds names
an id? Because, to quote the man-page, "... serves only for debugging
purposes. Names do not affect the behavior of the file descriptor, and
as such multiple files can have the same name without any side effects."
And they aren't filtered _at all_ either. I think the anonymous vma name
series has gone out of its way to be safe and sane while still providing
the ease-of-use that it was designed to provide.
> Initially I was not really feeling strongly one way or other but more we
> are discussing the topic the more I see that strings have a very weak
> justification behind.
I just don't see any _down_ side to gaining this. There's only resource
utilization when it's used, and the complexity is minimal.
--
Kees Cook
On Fri, Oct 08, 2021 at 09:43:59AM +0200, David Hildenbrand wrote:
> I'm going to point out that we already do have names for memfds.
I just did the same here[1]. :P
> [...] It's also displayed in /proc/self/maps.
I missed that part! /me screams forever
We really need to filter this at creation time. :( At least
seq_file_path() escapes "\n" for it, but not "\r", so humans on a
terminal could get very confused...
$ ./memfd '^M0000000000000000-ffffffffffffffff rwxp 00000000 00:00 0 [stack]' &
[1] 2953833
$ cat /proc/2953833/maps
...
0000000000000000-ffffffffffffffff rwxp 00000000 00:00 0 [stack] (deleted)
...
-Kees
[1] https://lore.kernel.org/lkml/202110081344.FE6A7A82@keescook
--
Kees Cook
On Fri 08-10-21 13:58:01, Kees Cook wrote:
> - Strings for "anon" specifically have no required format (this is good)
> it's informational like the task_struct::comm and can (roughly)
> anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> is one needed here? That seems like a completely unreasonable
> requirement.
I might be misreading the justification for the feature. Patch 2 is
talking about tools that need to understand memeory usage to make
further actions. Also Suren was suggesting "numbering convetion" as an
argument against.
So can we get a clear example how is this being used actually? If this
is just to be used to debug by humans than I can see an argument for
human readable form. If this is, however, meant to be used by tools to
make some actions then the argument for strings is much weaker.
--
Michal Hocko
SUSE Labs
On Mon, Oct 11, 2021 at 6:18 PM Suren Baghdasaryan <[email protected]> wrote:
>
> On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
> >
> > On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > > - Strings for "anon" specifically have no required format (this is good)
> > > it's informational like the task_struct::comm and can (roughly)
> > > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > > is one needed here? That seems like a completely unreasonable
> > > requirement.
> >
> > I might be misreading the justification for the feature. Patch 2 is
> > talking about tools that need to understand memeory usage to make
> > further actions. Also Suren was suggesting "numbering convetion" as an
> > argument against.
> >
> > So can we get a clear example how is this being used actually? If this
> > is just to be used to debug by humans than I can see an argument for
> > human readable form. If this is, however, meant to be used by tools to
> > make some actions then the argument for strings is much weaker.
>
> The simplest usecase is when we notice that a process consumes more
> memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
> check which area is contributing to this growth. The names we assign
> to anonymous areas are descriptive enough for a developer to get an
> idea where the increased consumption is coming from and how to proceed
> with their investigation.
> There are of course cases when tools are involved, but the end-user is
> always a human and the final report should contain easily
> understandable data.
>
> IIUC, the main argument here is whether the userspace can provide
> tools to perform the translations between ids and names, with the
> kernel accepting and reporting ids instead of strings. Technically
> it's possible, but to be practical that conversion should be fast
> because we will need to make name->id conversion potentially for each
> mmap. On the consumer side the performance is not as critical, but the
> fact that instead of dumping /proc/$pid/maps we will have to parse the
> file, do id->name conversion and replace all [anon:id] with
> [anon:name] would be an issue when we do that in bulk, for example
> when collecting system-wide data for a bugreport.
>
> I went ahead and implemented the proposed userspace solution involving
> tmpfs as a repository for name->id mapping (more precisely
> filename->inode mapping). Profiling shows that open()+fstat()+close()
> takes:
> - roughly 15 times longer than mmap() with 1000 unique names each
> being reused 50 times.
> - roughly 3 times longer than mmap() with 100 unique names each being
> reused 500 times. This is due to lstat() optimization suggested by
> Rasmus which avoids open() and close().
> For comparison, proposed prctl() takes roughly the same amount of time
> as mmap() and does not depend on the number of unique names.
>
> I'm still evaluating the proposal to use memfds but I'm not sure if
> the issue that David Hildenbrand mentioned about additional memory
> consumed in pagecache (which has to be addressed) is the only one we
> will encounter with this approach. If anyone knows of any potential
> issues with using memfds as named anonymous memory, I would really
> appreciate your feedback before I go too far in that direction.
> Thanks,
> Suren.
Just noticed that timmurray@ was dropped from the last reply. Adding him back.
>
> > --
> > Michal Hocko
> > SUSE Labs
On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
>
> On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > - Strings for "anon" specifically have no required format (this is good)
> > it's informational like the task_struct::comm and can (roughly)
> > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > is one needed here? That seems like a completely unreasonable
> > requirement.
>
> I might be misreading the justification for the feature. Patch 2 is
> talking about tools that need to understand memeory usage to make
> further actions. Also Suren was suggesting "numbering convetion" as an
> argument against.
>
> So can we get a clear example how is this being used actually? If this
> is just to be used to debug by humans than I can see an argument for
> human readable form. If this is, however, meant to be used by tools to
> make some actions then the argument for strings is much weaker.
The simplest usecase is when we notice that a process consumes more
memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
check which area is contributing to this growth. The names we assign
to anonymous areas are descriptive enough for a developer to get an
idea where the increased consumption is coming from and how to proceed
with their investigation.
There are of course cases when tools are involved, but the end-user is
always a human and the final report should contain easily
understandable data.
IIUC, the main argument here is whether the userspace can provide
tools to perform the translations between ids and names, with the
kernel accepting and reporting ids instead of strings. Technically
it's possible, but to be practical that conversion should be fast
because we will need to make name->id conversion potentially for each
mmap. On the consumer side the performance is not as critical, but the
fact that instead of dumping /proc/$pid/maps we will have to parse the
file, do id->name conversion and replace all [anon:id] with
[anon:name] would be an issue when we do that in bulk, for example
when collecting system-wide data for a bugreport.
I went ahead and implemented the proposed userspace solution involving
tmpfs as a repository for name->id mapping (more precisely
filename->inode mapping). Profiling shows that open()+fstat()+close()
takes:
- roughly 15 times longer than mmap() with 1000 unique names each
being reused 50 times.
- roughly 3 times longer than mmap() with 100 unique names each being
reused 500 times. This is due to lstat() optimization suggested by
Rasmus which avoids open() and close().
For comparison, proposed prctl() takes roughly the same amount of time
as mmap() and does not depend on the number of unique names.
I'm still evaluating the proposal to use memfds but I'm not sure if
the issue that David Hildenbrand mentioned about additional memory
consumed in pagecache (which has to be addressed) is the only one we
will encounter with this approach. If anyone knows of any potential
issues with using memfds as named anonymous memory, I would really
appreciate your feedback before I go too far in that direction.
Thanks,
Suren.
> --
> Michal Hocko
> SUSE Labs
On Mon, Oct 11, 2021 at 06:20:25PM -0700, Suren Baghdasaryan wrote:
> On Mon, Oct 11, 2021 at 6:18 PM Suren Baghdasaryan <[email protected]> wrote:
> >
> > On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
> > >
> > > On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > > > - Strings for "anon" specifically have no required format (this is good)
> > > > it's informational like the task_struct::comm and can (roughly)
> > > > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > > > is one needed here? That seems like a completely unreasonable
> > > > requirement.
> > >
> > > I might be misreading the justification for the feature. Patch 2 is
> > > talking about tools that need to understand memeory usage to make
> > > further actions. Also Suren was suggesting "numbering convetion" as an
> > > argument against.
> > >
> > > So can we get a clear example how is this being used actually? If this
> > > is just to be used to debug by humans than I can see an argument for
> > > human readable form. If this is, however, meant to be used by tools to
> > > make some actions then the argument for strings is much weaker.
> >
> > The simplest usecase is when we notice that a process consumes more
> > memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
> > check which area is contributing to this growth. The names we assign
> > to anonymous areas are descriptive enough for a developer to get an
> > idea where the increased consumption is coming from and how to proceed
> > with their investigation.
> > There are of course cases when tools are involved, but the end-user is
> > always a human and the final report should contain easily
> > understandable data.
> >
> > IIUC, the main argument here is whether the userspace can provide
> > tools to perform the translations between ids and names, with the
> > kernel accepting and reporting ids instead of strings. Technically
> > it's possible, but to be practical that conversion should be fast
> > because we will need to make name->id conversion potentially for each
> > mmap. On the consumer side the performance is not as critical, but the
> > fact that instead of dumping /proc/$pid/maps we will have to parse the
> > file, do id->name conversion and replace all [anon:id] with
> > [anon:name] would be an issue when we do that in bulk, for example
> > when collecting system-wide data for a bugreport.
Is that something you need to do client-side? Or could the bug tool
upload the userspace-maintained name:ids database alongside the
/proc/pid/maps dump for external processing?
On Mon, Oct 11, 2021 at 8:00 PM Johannes Weiner <[email protected]> wrote:
>
> On Mon, Oct 11, 2021 at 06:20:25PM -0700, Suren Baghdasaryan wrote:
> > On Mon, Oct 11, 2021 at 6:18 PM Suren Baghdasaryan <[email protected]> wrote:
> > >
> > > On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
> > > >
> > > > On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > > > > - Strings for "anon" specifically have no required format (this is good)
> > > > > it's informational like the task_struct::comm and can (roughly)
> > > > > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > > > > is one needed here? That seems like a completely unreasonable
> > > > > requirement.
> > > >
> > > > I might be misreading the justification for the feature. Patch 2 is
> > > > talking about tools that need to understand memeory usage to make
> > > > further actions. Also Suren was suggesting "numbering convetion" as an
> > > > argument against.
> > > >
> > > > So can we get a clear example how is this being used actually? If this
> > > > is just to be used to debug by humans than I can see an argument for
> > > > human readable form. If this is, however, meant to be used by tools to
> > > > make some actions then the argument for strings is much weaker.
> > >
> > > The simplest usecase is when we notice that a process consumes more
> > > memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
> > > check which area is contributing to this growth. The names we assign
> > > to anonymous areas are descriptive enough for a developer to get an
> > > idea where the increased consumption is coming from and how to proceed
> > > with their investigation.
> > > There are of course cases when tools are involved, but the end-user is
> > > always a human and the final report should contain easily
> > > understandable data.
> > >
> > > IIUC, the main argument here is whether the userspace can provide
> > > tools to perform the translations between ids and names, with the
> > > kernel accepting and reporting ids instead of strings. Technically
> > > it's possible, but to be practical that conversion should be fast
> > > because we will need to make name->id conversion potentially for each
> > > mmap. On the consumer side the performance is not as critical, but the
> > > fact that instead of dumping /proc/$pid/maps we will have to parse the
> > > file, do id->name conversion and replace all [anon:id] with
> > > [anon:name] would be an issue when we do that in bulk, for example
> > > when collecting system-wide data for a bugreport.
>
> Is that something you need to do client-side? Or could the bug tool
> upload the userspace-maintained name:ids database alongside the
> /proc/pid/maps dump for external processing?
You can generate a bugreport and analyze it locally or submit it as an
attachment to a bug for further analyzes.
Sure, we can attach the id->name conversion table to the bugreport but
either way, some tool would have to post-process it to resolve the
ids. If we are not analyzing the results immediately then that step
can be postponed and I think that's what you mean? If so, then yes,
that is correct.
On Mon 11-10-21 18:20:25, Suren Baghdasaryan wrote:
> On Mon, Oct 11, 2021 at 6:18 PM Suren Baghdasaryan <[email protected]> wrote:
> >
> > On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
> > >
> > > On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > > > - Strings for "anon" specifically have no required format (this is good)
> > > > it's informational like the task_struct::comm and can (roughly)
> > > > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > > > is one needed here? That seems like a completely unreasonable
> > > > requirement.
> > >
> > > I might be misreading the justification for the feature. Patch 2 is
> > > talking about tools that need to understand memeory usage to make
> > > further actions. Also Suren was suggesting "numbering convetion" as an
> > > argument against.
> > >
> > > So can we get a clear example how is this being used actually? If this
> > > is just to be used to debug by humans than I can see an argument for
> > > human readable form. If this is, however, meant to be used by tools to
> > > make some actions then the argument for strings is much weaker.
> >
> > The simplest usecase is when we notice that a process consumes more
> > memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
> > check which area is contributing to this growth. The names we assign
> > to anonymous areas are descriptive enough for a developer to get an
> > idea where the increased consumption is coming from and how to proceed
> > with their investigation.
> > There are of course cases when tools are involved, but the end-user is
> > always a human and the final report should contain easily
> > understandable data.
OK, it would have been much more preferable to be explicit about this
main use case from the very beginning. Just to make sure we are at the
same page. Is the primary usecase usage and bug reporting?
My initial understanding was that at userspace managed memory management
could make an educated guess about targeted reclaim (e.g. MADV_{FREE,COLD,PAGEOUT}
for cached data in memory like uncompressed images/data). Such a usecase
would clearly require a standardized id/naming convention to be
application neutral.
> > IIUC, the main argument here is whether the userspace can provide
> > tools to perform the translations between ids and names, with the
> > kernel accepting and reporting ids instead of strings. Technically
> > it's possible, but to be practical that conversion should be fast
> > because we will need to make name->id conversion potentially for each
> > mmap. On the consumer side the performance is not as critical, but the
> > fact that instead of dumping /proc/$pid/maps we will have to parse the
> > file, do id->name conversion and replace all [anon:id] with
> > [anon:name] would be an issue when we do that in bulk, for example
> > when collecting system-wide data for a bugreport.
Whether you use ids or human readable strings you still have to
understand the underlying meaning to make any educated guess. Let me
give you an example. Say I have an application with a memory leak. Right
now I can only tell that it is anonymous memory growing but it is not
clear who uses that anonymous. You are adding a means to tell different
users appart. That is really helpful. Now I know this is an anon
user 1234 or MySuperAnonMemory. Neither of the will not tell me more
without a id/naming convention or reading the code. A convention can be
useful for the most common users (e.g. a specific allocator) but I am
rather dubious there are many more that would be _generally_ recognized
without some understanding of the said application.
Maybe the situation in Android is different because the runtime is more
coupled but is it reasonable to expect any common naming conventions for
general Linux platforms?
I am slightly worried that we have spent way too much time talking
specifics about id->name translation rather than the actual usability
of the token.
--
Michal Hocko
SUSE Labs
> I'm still evaluating the proposal to use memfds but I'm not sure if
> the issue that David Hildenbrand mentioned about additional memory
> consumed in pagecache (which has to be addressed) is the only one we
> will encounter with this approach. If anyone knows of any potential
> issues with using memfds as named anonymous memory, I would really
> appreciate your feedback before I go too far in that direction.
[MAP_PRIVATE memfd only behave that way with 4k, not with huge pages, so
I think it just has to be fixed. It doesn't make any sense to allocate a
page for the pagecache ("populate the file") when accessing via a
private mapping that's supposed to leave the file untouched]
My gut feeling is if you really need a string as identifier, then try
going with memfds. Yes, we might hit some road blocks to be sorted out,
but it just logically makes sense to me: Files have names. These names
exist before mapping and after mapping. They "name" the content.
Maybe it's just me, but the whole interface, setting the name via a
prctl after the mapping was already instantiated doesn't really spark
joy at my end. That's not a strong pushback, but if we can avoid it
using something that's already there, that would be very much preferred.
--
Thanks,
David / dhildenb
On Tue, Oct 12, 2021 at 12:37 AM Michal Hocko <[email protected]> wrote:
>
> On Mon 11-10-21 18:20:25, Suren Baghdasaryan wrote:
> > On Mon, Oct 11, 2021 at 6:18 PM Suren Baghdasaryan <[email protected]> wrote:
> > >
> > > On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
> > > >
> > > > On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > > > > - Strings for "anon" specifically have no required format (this is good)
> > > > > it's informational like the task_struct::comm and can (roughly)
> > > > > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > > > > is one needed here? That seems like a completely unreasonable
> > > > > requirement.
> > > >
> > > > I might be misreading the justification for the feature. Patch 2 is
> > > > talking about tools that need to understand memeory usage to make
> > > > further actions. Also Suren was suggesting "numbering convetion" as an
> > > > argument against.
> > > >
> > > > So can we get a clear example how is this being used actually? If this
> > > > is just to be used to debug by humans than I can see an argument for
> > > > human readable form. If this is, however, meant to be used by tools to
> > > > make some actions then the argument for strings is much weaker.
> > >
> > > The simplest usecase is when we notice that a process consumes more
> > > memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
> > > check which area is contributing to this growth. The names we assign
> > > to anonymous areas are descriptive enough for a developer to get an
> > > idea where the increased consumption is coming from and how to proceed
> > > with their investigation.
> > > There are of course cases when tools are involved, but the end-user is
> > > always a human and the final report should contain easily
> > > understandable data.
>
> OK, it would have been much more preferable to be explicit about this
> main use case from the very beginning. Just to make sure we are at the
> same page. Is the primary usecase usage and bug reporting?
Sorry, I should have spent more time on patch #2 description. Yes,
debugging memory issues is the primary usecase. In fact that's the
only usecase in Android AFAIK.
>
> My initial understanding was that at userspace managed memory management
> could make an educated guess about targeted reclaim (e.g. MADV_{FREE,COLD,PAGEOUT}
> for cached data in memory like uncompressed images/data). Such a usecase
> would clearly require a standardized id/naming convention to be
> application neutral.
Ah, now I understand your angle. Our prior work on process_madvise()
probably helped in leading your thoughts in this direction :) Sorry
about the confusion.
>
> > > IIUC, the main argument here is whether the userspace can provide
> > > tools to perform the translations between ids and names, with the
> > > kernel accepting and reporting ids instead of strings. Technically
> > > it's possible, but to be practical that conversion should be fast
> > > because we will need to make name->id conversion potentially for each
> > > mmap. On the consumer side the performance is not as critical, but the
> > > fact that instead of dumping /proc/$pid/maps we will have to parse the
> > > file, do id->name conversion and replace all [anon:id] with
> > > [anon:name] would be an issue when we do that in bulk, for example
> > > when collecting system-wide data for a bugreport.
>
> Whether you use ids or human readable strings you still have to
> understand the underlying meaning to make any educated guess. Let me
> give you an example. Say I have an application with a memory leak. Right
> now I can only tell that it is anonymous memory growing but it is not
> clear who uses that anonymous. You are adding a means to tell different
> users appart. That is really helpful. Now I know this is an anon
> user 1234 or MySuperAnonMemory. Neither of the will not tell me more
> without a id/naming convention or reading the code. A convention can be
> useful for the most common users (e.g. a specific allocator) but I am
> rather dubious there are many more that would be _generally_ recognized
> without some understanding of the said application.
I guess an example would be better to clarify this. Here are some vma
names from Google maps app:
[anon:dalvik-main space (region space)]
[anon:dalvik-/apex/com.android.art/javalib/boot.art]
[anon:dalvik-/apex/com.android.art/javalib/boot-apache-xml.art]
[anon:.bss]
[anon:dalvik-zygote space]
[anon:dalvik-non moving space]
[anon:dalvik-free list large object space]
[anon:dalvik-/product/app/Maps/oat/arm64/Maps.art]
[anon:stack_and_tls:20792]
[anon:stack_and_tls:20791]
[anon:dalvik-LinearAlloc]
[anon:dalvik-CompilerMetadata]
[anon:dalvik-indirect ref table]
[anon:dalvik-live stack]
[anon:dalvik-allocation stack]
[anon:dalvik-large object free list space allocation info map]
[anon:scudo:primary]
[anon:scudo:secondary]
[anon:bionic_alloc_small_objects]
Most of them have names standard for Android and can be recognized by
developers and even Android framework (example where "anon:dalvik-main
space" and other standard names are being parsed:
https://cs.android.com/android/platform/superproject/+/master:frameworks/base/core/jni/android_os_Debug.cpp;l=340).
Names like "anon:dalvik-/apex/com.android.art/javalib/boot.art" help
the developer to recognize the component responsible for the memory.
Names like "anon:stack_and_tls:20792" include the TID of the thread
which uses this memory. All this information can help in narrowing
down memory consumption investigation. Hopefully these examples
clarify the usage a bit better?
>
> Maybe the situation in Android is different because the runtime is more
> coupled but is it reasonable to expect any common naming conventions for
> general Linux platforms?
Well, to be useful the system would have to agree to *some* convention I guess.
>
> I am slightly worried that we have spent way too much time talking
> specifics about id->name translation rather than the actual usability
> of the token.
Agree. I'll try to avoid further confusions.
Thanks!
> --
> Michal Hocko
> SUSE Labs
On Tue, Oct 12, 2021 at 12:44 AM David Hildenbrand <[email protected]> wrote:
>
> > I'm still evaluating the proposal to use memfds but I'm not sure if
> > the issue that David Hildenbrand mentioned about additional memory
> > consumed in pagecache (which has to be addressed) is the only one we
> > will encounter with this approach. If anyone knows of any potential
> > issues with using memfds as named anonymous memory, I would really
> > appreciate your feedback before I go too far in that direction.
>
> [MAP_PRIVATE memfd only behave that way with 4k, not with huge pages, so
> I think it just has to be fixed. It doesn't make any sense to allocate a
> page for the pagecache ("populate the file") when accessing via a
> private mapping that's supposed to leave the file untouched]
>
> My gut feeling is if you really need a string as identifier, then try
> going with memfds. Yes, we might hit some road blocks to be sorted out,
> but it just logically makes sense to me: Files have names. These names
> exist before mapping and after mapping. They "name" the content.
I'm investigating this direction. I don't have much background with
memfds, so I'll need to digest the code first.
>
> Maybe it's just me, but the whole interface, setting the name via a
> prctl after the mapping was already instantiated doesn't really spark
> joy at my end. That's not a strong pushback, but if we can avoid it
> using something that's already there, that would be very much preferred.
Actually that's one of my worries about using memfds. There might be
cases when we need to name a vma after it was mapped. memfd_create()
would not allow us to do that AFAIKT. But I need to check all usages
to say if that's really an issue.
Thanks!
>
> --
> Thanks,
>
> David / dhildenb
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
>
On Mon, Oct 11, 2021 at 10:36:24PM -0700, Suren Baghdasaryan wrote:
> On Mon, Oct 11, 2021 at 8:00 PM Johannes Weiner <[email protected]> wrote:
> >
> > On Mon, Oct 11, 2021 at 06:20:25PM -0700, Suren Baghdasaryan wrote:
> > > On Mon, Oct 11, 2021 at 6:18 PM Suren Baghdasaryan <[email protected]> wrote:
> > > >
> > > > On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
> > > > >
> > > > > On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > > > > > - Strings for "anon" specifically have no required format (this is good)
> > > > > > it's informational like the task_struct::comm and can (roughly)
> > > > > > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > > > > > is one needed here? That seems like a completely unreasonable
> > > > > > requirement.
> > > > >
> > > > > I might be misreading the justification for the feature. Patch 2 is
> > > > > talking about tools that need to understand memeory usage to make
> > > > > further actions. Also Suren was suggesting "numbering convetion" as an
> > > > > argument against.
> > > > >
> > > > > So can we get a clear example how is this being used actually? If this
> > > > > is just to be used to debug by humans than I can see an argument for
> > > > > human readable form. If this is, however, meant to be used by tools to
> > > > > make some actions then the argument for strings is much weaker.
> > > >
> > > > The simplest usecase is when we notice that a process consumes more
> > > > memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
> > > > check which area is contributing to this growth. The names we assign
> > > > to anonymous areas are descriptive enough for a developer to get an
> > > > idea where the increased consumption is coming from and how to proceed
> > > > with their investigation.
> > > > There are of course cases when tools are involved, but the end-user is
> > > > always a human and the final report should contain easily
> > > > understandable data.
> > > >
> > > > IIUC, the main argument here is whether the userspace can provide
> > > > tools to perform the translations between ids and names, with the
> > > > kernel accepting and reporting ids instead of strings. Technically
> > > > it's possible, but to be practical that conversion should be fast
> > > > because we will need to make name->id conversion potentially for each
> > > > mmap. On the consumer side the performance is not as critical, but the
> > > > fact that instead of dumping /proc/$pid/maps we will have to parse the
> > > > file, do id->name conversion and replace all [anon:id] with
> > > > [anon:name] would be an issue when we do that in bulk, for example
> > > > when collecting system-wide data for a bugreport.
> >
> > Is that something you need to do client-side? Or could the bug tool
> > upload the userspace-maintained name:ids database alongside the
> > /proc/pid/maps dump for external processing?
>
> You can generate a bugreport and analyze it locally or submit it as an
> attachment to a bug for further analyzes.
> Sure, we can attach the id->name conversion table to the bugreport but
> either way, some tool would have to post-process it to resolve the
> ids. If we are not analyzing the results immediately then that step
> can be postponed and I think that's what you mean? If so, then yes,
> that is correct.
Right, somebody needs to do it at some point, but I suppose it's less
of a problem if a developer machine does it than a mobile device.
One advantage of an ID over a string - besides not having to maintain
a deduplicating arbitrary string storage in the kernel - is that we
may be able to auto-assign unique IDs to VMAs in the kernel, in a way
that we could not with strings. You'd still have to do IPC calls to
write new name mappings into your db, but you wouldn't have to do the
prctl() to assign stuff in the kernel at all.
(We'd have to think of a solution of how IDs work with vma merging and
splitting, but I think to a certain degree that's policy and we should
be able to find something workable - a MAP_ID flag, using anon_vma as
identity, assigning IDs at mmap time and do merges only for protection
changes etc. etc.)
On Tue, Oct 12, 2021 at 11:26 AM Johannes Weiner <[email protected]> wrote:
>
> On Mon, Oct 11, 2021 at 10:36:24PM -0700, Suren Baghdasaryan wrote:
> > On Mon, Oct 11, 2021 at 8:00 PM Johannes Weiner <[email protected]> wrote:
> > >
> > > On Mon, Oct 11, 2021 at 06:20:25PM -0700, Suren Baghdasaryan wrote:
> > > > On Mon, Oct 11, 2021 at 6:18 PM Suren Baghdasaryan <[email protected]> wrote:
> > > > >
> > > > > On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
> > > > > >
> > > > > > On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > > > > > > - Strings for "anon" specifically have no required format (this is good)
> > > > > > > it's informational like the task_struct::comm and can (roughly)
> > > > > > > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > > > > > > is one needed here? That seems like a completely unreasonable
> > > > > > > requirement.
> > > > > >
> > > > > > I might be misreading the justification for the feature. Patch 2 is
> > > > > > talking about tools that need to understand memeory usage to make
> > > > > > further actions. Also Suren was suggesting "numbering convetion" as an
> > > > > > argument against.
> > > > > >
> > > > > > So can we get a clear example how is this being used actually? If this
> > > > > > is just to be used to debug by humans than I can see an argument for
> > > > > > human readable form. If this is, however, meant to be used by tools to
> > > > > > make some actions then the argument for strings is much weaker.
> > > > >
> > > > > The simplest usecase is when we notice that a process consumes more
> > > > > memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
> > > > > check which area is contributing to this growth. The names we assign
> > > > > to anonymous areas are descriptive enough for a developer to get an
> > > > > idea where the increased consumption is coming from and how to proceed
> > > > > with their investigation.
> > > > > There are of course cases when tools are involved, but the end-user is
> > > > > always a human and the final report should contain easily
> > > > > understandable data.
> > > > >
> > > > > IIUC, the main argument here is whether the userspace can provide
> > > > > tools to perform the translations between ids and names, with the
> > > > > kernel accepting and reporting ids instead of strings. Technically
> > > > > it's possible, but to be practical that conversion should be fast
> > > > > because we will need to make name->id conversion potentially for each
> > > > > mmap. On the consumer side the performance is not as critical, but the
> > > > > fact that instead of dumping /proc/$pid/maps we will have to parse the
> > > > > file, do id->name conversion and replace all [anon:id] with
> > > > > [anon:name] would be an issue when we do that in bulk, for example
> > > > > when collecting system-wide data for a bugreport.
> > >
> > > Is that something you need to do client-side? Or could the bug tool
> > > upload the userspace-maintained name:ids database alongside the
> > > /proc/pid/maps dump for external processing?
> >
> > You can generate a bugreport and analyze it locally or submit it as an
> > attachment to a bug for further analyzes.
> > Sure, we can attach the id->name conversion table to the bugreport but
> > either way, some tool would have to post-process it to resolve the
> > ids. If we are not analyzing the results immediately then that step
> > can be postponed and I think that's what you mean? If so, then yes,
> > that is correct.
>
> Right, somebody needs to do it at some point, but I suppose it's less
> of a problem if a developer machine does it than a mobile device.
True, and that's why I mentioned that it's not as critical as the
efficiency at mmap() time. In any case, if we could avoid translations
at all that would be ideal.
>
> One advantage of an ID over a string - besides not having to maintain
> a deduplicating arbitrary string storage in the kernel - is that we
> may be able to auto-assign unique IDs to VMAs in the kernel, in a way
> that we could not with strings. You'd still have to do IPC calls to
> write new name mappings into your db, but you wouldn't have to do the
> prctl() to assign stuff in the kernel at all.
You still have to retrieve that tag from the kernel to record it in
your db, so this would still require some syscall, no?
> (We'd have to think of a solution of how IDs work with vma merging and
> splitting, but I think to a certain degree that's policy and we should
> be able to find something workable - a MAP_ID flag, using anon_vma as
> identity, assigning IDs at mmap time and do merges only for protection
> changes etc. etc.)
Overall, I think keeping the kernel out of this and letting it treat
this tag as a cookie which only userspace cares about is simpler.
Unless you see other uses where kernel's involvement is needed.
On Tue, Oct 12, 2021 at 11:52:42AM -0700, Suren Baghdasaryan wrote:
> On Tue, Oct 12, 2021 at 11:26 AM Johannes Weiner <[email protected]> wrote:
> >
> > On Mon, Oct 11, 2021 at 10:36:24PM -0700, Suren Baghdasaryan wrote:
> > > On Mon, Oct 11, 2021 at 8:00 PM Johannes Weiner <[email protected]> wrote:
> > > >
> > > > On Mon, Oct 11, 2021 at 06:20:25PM -0700, Suren Baghdasaryan wrote:
> > > > > On Mon, Oct 11, 2021 at 6:18 PM Suren Baghdasaryan <[email protected]> wrote:
> > > > > >
> > > > > > On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
> > > > > > >
> > > > > > > On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > > > > > > > - Strings for "anon" specifically have no required format (this is good)
> > > > > > > > it's informational like the task_struct::comm and can (roughly)
> > > > > > > > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > > > > > > > is one needed here? That seems like a completely unreasonable
> > > > > > > > requirement.
> > > > > > >
> > > > > > > I might be misreading the justification for the feature. Patch 2 is
> > > > > > > talking about tools that need to understand memeory usage to make
> > > > > > > further actions. Also Suren was suggesting "numbering convetion" as an
> > > > > > > argument against.
> > > > > > >
> > > > > > > So can we get a clear example how is this being used actually? If this
> > > > > > > is just to be used to debug by humans than I can see an argument for
> > > > > > > human readable form. If this is, however, meant to be used by tools to
> > > > > > > make some actions then the argument for strings is much weaker.
> > > > > >
> > > > > > The simplest usecase is when we notice that a process consumes more
> > > > > > memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
> > > > > > check which area is contributing to this growth. The names we assign
> > > > > > to anonymous areas are descriptive enough for a developer to get an
> > > > > > idea where the increased consumption is coming from and how to proceed
> > > > > > with their investigation.
> > > > > > There are of course cases when tools are involved, but the end-user is
> > > > > > always a human and the final report should contain easily
> > > > > > understandable data.
> > > > > >
> > > > > > IIUC, the main argument here is whether the userspace can provide
> > > > > > tools to perform the translations between ids and names, with the
> > > > > > kernel accepting and reporting ids instead of strings. Technically
> > > > > > it's possible, but to be practical that conversion should be fast
> > > > > > because we will need to make name->id conversion potentially for each
> > > > > > mmap. On the consumer side the performance is not as critical, but the
> > > > > > fact that instead of dumping /proc/$pid/maps we will have to parse the
> > > > > > file, do id->name conversion and replace all [anon:id] with
> > > > > > [anon:name] would be an issue when we do that in bulk, for example
> > > > > > when collecting system-wide data for a bugreport.
> > > >
> > > > Is that something you need to do client-side? Or could the bug tool
> > > > upload the userspace-maintained name:ids database alongside the
> > > > /proc/pid/maps dump for external processing?
> > >
> > > You can generate a bugreport and analyze it locally or submit it as an
> > > attachment to a bug for further analyzes.
> > > Sure, we can attach the id->name conversion table to the bugreport but
> > > either way, some tool would have to post-process it to resolve the
> > > ids. If we are not analyzing the results immediately then that step
> > > can be postponed and I think that's what you mean? If so, then yes,
> > > that is correct.
> >
> > Right, somebody needs to do it at some point, but I suppose it's less
> > of a problem if a developer machine does it than a mobile device.
>
> True, and that's why I mentioned that it's not as critical as the
> efficiency at mmap() time. In any case, if we could avoid translations
> at all that would be ideal.
>
> >
> > One advantage of an ID over a string - besides not having to maintain
> > a deduplicating arbitrary string storage in the kernel - is that we
> > may be able to auto-assign unique IDs to VMAs in the kernel, in a way
> > that we could not with strings. You'd still have to do IPC calls to
> > write new name mappings into your db, but you wouldn't have to do the
> > prctl() to assign stuff in the kernel at all.
>
> You still have to retrieve that tag from the kernel to record it in
> your db, so this would still require some syscall, no?
Don't you have to do this with the string setting interface as well?
How do you know the vma address to pass into the prctl()? Is this
somehow coordinated with the mmap()?
> > (We'd have to think of a solution of how IDs work with vma merging and
> > splitting, but I think to a certain degree that's policy and we should
> > be able to find something workable - a MAP_ID flag, using anon_vma as
> > identity, assigning IDs at mmap time and do merges only for protection
> > changes etc. etc.)
>
> Overall, I think keeping the kernel out of this and letting it treat
> this tag as a cookie which only userspace cares about is simpler.
> Unless you see other uses where kernel's involvement is needed.
It depends on what you consider keeping the kernel out of it. A small
extension to assign unique IDs to mappings automatically in an
intuitive way (with a compat option to disable) is a much smaller ABI
commitment than a prctl()-controlled string storage.
When I say policy on how to assign the ID, I didn't mean that it
should be a free for all. Rather that we should pick one reasonable
way to do it, comparable to picking the parameters for how long the
stored strings could be, which characters to allow etc.
On Tue, Oct 12, 2021 at 1:41 PM Johannes Weiner <[email protected]> wrote:
>
> On Tue, Oct 12, 2021 at 11:52:42AM -0700, Suren Baghdasaryan wrote:
> > On Tue, Oct 12, 2021 at 11:26 AM Johannes Weiner <[email protected]> wrote:
> > >
> > > On Mon, Oct 11, 2021 at 10:36:24PM -0700, Suren Baghdasaryan wrote:
> > > > On Mon, Oct 11, 2021 at 8:00 PM Johannes Weiner <[email protected]> wrote:
> > > > >
> > > > > On Mon, Oct 11, 2021 at 06:20:25PM -0700, Suren Baghdasaryan wrote:
> > > > > > On Mon, Oct 11, 2021 at 6:18 PM Suren Baghdasaryan <[email protected]> wrote:
> > > > > > >
> > > > > > > On Mon, Oct 11, 2021 at 1:36 AM Michal Hocko <[email protected]> wrote:
> > > > > > > >
> > > > > > > > On Fri 08-10-21 13:58:01, Kees Cook wrote:
> > > > > > > > > - Strings for "anon" specifically have no required format (this is good)
> > > > > > > > > it's informational like the task_struct::comm and can (roughly)
> > > > > > > > > anything. There's no naming convention for memfds, AF_UNIX, etc. Why
> > > > > > > > > is one needed here? That seems like a completely unreasonable
> > > > > > > > > requirement.
> > > > > > > >
> > > > > > > > I might be misreading the justification for the feature. Patch 2 is
> > > > > > > > talking about tools that need to understand memeory usage to make
> > > > > > > > further actions. Also Suren was suggesting "numbering convetion" as an
> > > > > > > > argument against.
> > > > > > > >
> > > > > > > > So can we get a clear example how is this being used actually? If this
> > > > > > > > is just to be used to debug by humans than I can see an argument for
> > > > > > > > human readable form. If this is, however, meant to be used by tools to
> > > > > > > > make some actions then the argument for strings is much weaker.
> > > > > > >
> > > > > > > The simplest usecase is when we notice that a process consumes more
> > > > > > > memory than usual and we do "cat /proc/$(pidof my_process)/maps" to
> > > > > > > check which area is contributing to this growth. The names we assign
> > > > > > > to anonymous areas are descriptive enough for a developer to get an
> > > > > > > idea where the increased consumption is coming from and how to proceed
> > > > > > > with their investigation.
> > > > > > > There are of course cases when tools are involved, but the end-user is
> > > > > > > always a human and the final report should contain easily
> > > > > > > understandable data.
> > > > > > >
> > > > > > > IIUC, the main argument here is whether the userspace can provide
> > > > > > > tools to perform the translations between ids and names, with the
> > > > > > > kernel accepting and reporting ids instead of strings. Technically
> > > > > > > it's possible, but to be practical that conversion should be fast
> > > > > > > because we will need to make name->id conversion potentially for each
> > > > > > > mmap. On the consumer side the performance is not as critical, but the
> > > > > > > fact that instead of dumping /proc/$pid/maps we will have to parse the
> > > > > > > file, do id->name conversion and replace all [anon:id] with
> > > > > > > [anon:name] would be an issue when we do that in bulk, for example
> > > > > > > when collecting system-wide data for a bugreport.
> > > > >
> > > > > Is that something you need to do client-side? Or could the bug tool
> > > > > upload the userspace-maintained name:ids database alongside the
> > > > > /proc/pid/maps dump for external processing?
> > > >
> > > > You can generate a bugreport and analyze it locally or submit it as an
> > > > attachment to a bug for further analyzes.
> > > > Sure, we can attach the id->name conversion table to the bugreport but
> > > > either way, some tool would have to post-process it to resolve the
> > > > ids. If we are not analyzing the results immediately then that step
> > > > can be postponed and I think that's what you mean? If so, then yes,
> > > > that is correct.
> > >
> > > Right, somebody needs to do it at some point, but I suppose it's less
> > > of a problem if a developer machine does it than a mobile device.
> >
> > True, and that's why I mentioned that it's not as critical as the
> > efficiency at mmap() time. In any case, if we could avoid translations
> > at all that would be ideal.
> >
> > >
> > > One advantage of an ID over a string - besides not having to maintain
> > > a deduplicating arbitrary string storage in the kernel - is that we
> > > may be able to auto-assign unique IDs to VMAs in the kernel, in a way
> > > that we could not with strings. You'd still have to do IPC calls to
> > > write new name mappings into your db, but you wouldn't have to do the
> > > prctl() to assign stuff in the kernel at all.
> >
> > You still have to retrieve that tag from the kernel to record it in
> > your db, so this would still require some syscall, no?
>
> Don't you have to do this with the string setting interface as well?
> How do you know the vma address to pass into the prctl()? Is this
> somehow coordinated with the mmap()?
Sure. The sequence is:
ptr = mmap(NULL, size, ...);
prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, name);
>
> > > (We'd have to think of a solution of how IDs work with vma merging and
> > > splitting, but I think to a certain degree that's policy and we should
> > > be able to find something workable - a MAP_ID flag, using anon_vma as
> > > identity, assigning IDs at mmap time and do merges only for protection
> > > changes etc. etc.)
> >
> > Overall, I think keeping the kernel out of this and letting it treat
> > this tag as a cookie which only userspace cares about is simpler.
> > Unless you see other uses where kernel's involvement is needed.
>
> It depends on what you consider keeping the kernel out of it. A small
> extension to assign unique IDs to mappings automatically in an
> intuitive way (with a compat option to disable) is a much smaller ABI
> commitment than a prctl()-controlled string storage.
I'm not saying it's hard or complex. I just don't see the advantage of
generating these IDs in the kernel vs passing them from userspace.
Maybe I'm missing some usecase?
> When I say policy on how to assign the ID, I didn't mean that it
> should be a free for all. Rather that we should pick one reasonable
> way to do it, comparable to picking the parameters for how long the
> stored strings could be, which characters to allow etc.
On Tue, Oct 12, 2021 at 10:01 AM Suren Baghdasaryan <[email protected]> wrote:
>
> On Tue, Oct 12, 2021 at 12:44 AM David Hildenbrand <[email protected]> wrote:
> >
> > > I'm still evaluating the proposal to use memfds but I'm not sure if
> > > the issue that David Hildenbrand mentioned about additional memory
> > > consumed in pagecache (which has to be addressed) is the only one we
> > > will encounter with this approach. If anyone knows of any potential
> > > issues with using memfds as named anonymous memory, I would really
> > > appreciate your feedback before I go too far in that direction.
> >
> > [MAP_PRIVATE memfd only behave that way with 4k, not with huge pages, so
> > I think it just has to be fixed. It doesn't make any sense to allocate a
> > page for the pagecache ("populate the file") when accessing via a
> > private mapping that's supposed to leave the file untouched]
> >
> > My gut feeling is if you really need a string as identifier, then try
> > going with memfds. Yes, we might hit some road blocks to be sorted out,
> > but it just logically makes sense to me: Files have names. These names
> > exist before mapping and after mapping. They "name" the content.
>
> I'm investigating this direction. I don't have much background with
> memfds, so I'll need to digest the code first.
I've done some investigation into the possibility of using memfds to
name anonymous VMAs. Here are my findings:
1. Forking a process with anonymous vmas named using memfd is 5-15%
slower than with prctl (depends on the number of VMAs in the process
being forked). Profiling shows that i_mmap_lock_write() dominates
dup_mmap(). Exit path is also slower by roughly 9% with
free_pgtables() and fput() dominating exit_mmap(). Fork performance is
important for Android because almost all processes are forked from
zygote, therefore this limitation already makes this approach
prohibitive.
2. mremap() usage to grow the mapping has an issue when used with memfds:
fd = memfd_create(name, MFD_ALLOW_SEALING);
ftruncate(fd, size_bytes);
ptr = mmap(NULL, size_bytes, prot, MAP_PRIVATE, fd, 0);
close(fd);
ptr = mremap(ptr, size_bytes, size_bytes * 2, MREMAP_MAYMOVE);
touch_mem(ptr, size_bytes * 2);
This would generate a SIGBUS in touch_mem(). I believe it's because
ftruncate() specified the size to be size_bytes and we are accessing
more than that after remapping. prctl() does not have this limitation
and we do have a usecase for growing a named VMA.
3. Leaves an fd exposed, even briefly, which may lead to unexpected
flaws (e.g. anything using mmap MAP_SHARED could allow exposures or
overwrites). Even MAP_PRIVATE, if an attacker writes into the file
after ftruncate() and before mmap(), can cause private memory to be
initialized with unexpected data.
4. There is a usecase in the Android userspace where vma naming
happens after memory was allocated. Bionic linker does in-memory
relocations and then names some relocated sections.
In the light of these findings, could the current patchset be reconsidered?
Thanks,
Suren.
>
> >
> > Maybe it's just me, but the whole interface, setting the name via a
> > prctl after the mapping was already instantiated doesn't really spark
> > joy at my end. That's not a strong pushback, but if we can avoid it
> > using something that's already there, that would be very much preferred.
>
> Actually that's one of my worries about using memfds. There might be
> cases when we need to name a vma after it was mapped. memfd_create()
> would not allow us to do that AFAIKT. But I need to check all usages
> to say if that's really an issue.
> Thanks!
>
> >
> > --
> > Thanks,
> >
> > David / dhildenb
> >
> > --
> > To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
> >
On 14.10.21 22:16, Suren Baghdasaryan wrote:
> On Tue, Oct 12, 2021 at 10:01 AM Suren Baghdasaryan <[email protected]> wrote:
>>
>> On Tue, Oct 12, 2021 at 12:44 AM David Hildenbrand <[email protected]> wrote:
>>>
>>>> I'm still evaluating the proposal to use memfds but I'm not sure if
>>>> the issue that David Hildenbrand mentioned about additional memory
>>>> consumed in pagecache (which has to be addressed) is the only one we
>>>> will encounter with this approach. If anyone knows of any potential
>>>> issues with using memfds as named anonymous memory, I would really
>>>> appreciate your feedback before I go too far in that direction.
>>>
>>> [MAP_PRIVATE memfd only behave that way with 4k, not with huge pages, so
>>> I think it just has to be fixed. It doesn't make any sense to allocate a
>>> page for the pagecache ("populate the file") when accessing via a
>>> private mapping that's supposed to leave the file untouched]
>>>
>>> My gut feeling is if you really need a string as identifier, then try
>>> going with memfds. Yes, we might hit some road blocks to be sorted out,
>>> but it just logically makes sense to me: Files have names. These names
>>> exist before mapping and after mapping. They "name" the content.
>>
>> I'm investigating this direction. I don't have much background with
>> memfds, so I'll need to digest the code first.
>
> I've done some investigation into the possibility of using memfds to
> name anonymous VMAs. Here are my findings:
Thanks for exploring the alternatives!
>
> 1. Forking a process with anonymous vmas named using memfd is 5-15%
> slower than with prctl (depends on the number of VMAs in the process
> being forked). Profiling shows that i_mmap_lock_write() dominates
> dup_mmap(). Exit path is also slower by roughly 9% with
> free_pgtables() and fput() dominating exit_mmap(). Fork performance is
> important for Android because almost all processes are forked from
> zygote, therefore this limitation already makes this approach
> prohibitive.
Interesting, naturally I wonder if that can be optimized.
>
> 2. mremap() usage to grow the mapping has an issue when used with memfds:
>
> fd = memfd_create(name, MFD_ALLOW_SEALING);
> ftruncate(fd, size_bytes);
> ptr = mmap(NULL, size_bytes, prot, MAP_PRIVATE, fd, 0);
> close(fd);
> ptr = mremap(ptr, size_bytes, size_bytes * 2, MREMAP_MAYMOVE);
> touch_mem(ptr, size_bytes * 2);
>
> This would generate a SIGBUS in touch_mem(). I believe it's because
> ftruncate() specified the size to be size_bytes and we are accessing
> more than that after remapping. prctl() does not have this limitation
> and we do have a usecase for growing a named VMA.
Can't you simply size the memfd much larger? I mean, it doesn't really
cost much, does it?
>
> 3. Leaves an fd exposed, even briefly, which may lead to unexpected
> flaws (e.g. anything using mmap MAP_SHARED could allow exposures or
> overwrites). Even MAP_PRIVATE, if an attacker writes into the file
> after ftruncate() and before mmap(), can cause private memory to be
> initialized with unexpected data.
I don't quite follow. Can you elaborate what exactly the issue here is?
We use a temporary fd, yes, but how is that a problem?
Any attacker can just write any random memory memory in the address
space, so I don't see the issue.
>
> 4. There is a usecase in the Android userspace where vma naming
> happens after memory was allocated. Bionic linker does in-memory
> relocations and then names some relocated sections.
Would renaming a memfd be an option or is that "too late" ?
>
> In the light of these findings, could the current patchset be reconsidered?
> Thanks,
> Suren.
>
--
Thanks,
David / dhildenb
On Fri, Oct 15, 2021 at 1:04 AM David Hildenbrand <[email protected]> wrote:
>
> On 14.10.21 22:16, Suren Baghdasaryan wrote:
> > On Tue, Oct 12, 2021 at 10:01 AM Suren Baghdasaryan <[email protected]> wrote:
> >>
> >> On Tue, Oct 12, 2021 at 12:44 AM David Hildenbrand <[email protected]> wrote:
> >>>
> >>>> I'm still evaluating the proposal to use memfds but I'm not sure if
> >>>> the issue that David Hildenbrand mentioned about additional memory
> >>>> consumed in pagecache (which has to be addressed) is the only one we
> >>>> will encounter with this approach. If anyone knows of any potential
> >>>> issues with using memfds as named anonymous memory, I would really
> >>>> appreciate your feedback before I go too far in that direction.
> >>>
> >>> [MAP_PRIVATE memfd only behave that way with 4k, not with huge pages, so
> >>> I think it just has to be fixed. It doesn't make any sense to allocate a
> >>> page for the pagecache ("populate the file") when accessing via a
> >>> private mapping that's supposed to leave the file untouched]
> >>>
> >>> My gut feeling is if you really need a string as identifier, then try
> >>> going with memfds. Yes, we might hit some road blocks to be sorted out,
> >>> but it just logically makes sense to me: Files have names. These names
> >>> exist before mapping and after mapping. They "name" the content.
> >>
> >> I'm investigating this direction. I don't have much background with
> >> memfds, so I'll need to digest the code first.
> >
> > I've done some investigation into the possibility of using memfds to
> > name anonymous VMAs. Here are my findings:
>
> Thanks for exploring the alternatives!
Thanks for pointing to them!
>
> >
> > 1. Forking a process with anonymous vmas named using memfd is 5-15%
> > slower than with prctl (depends on the number of VMAs in the process
> > being forked). Profiling shows that i_mmap_lock_write() dominates
> > dup_mmap(). Exit path is also slower by roughly 9% with
> > free_pgtables() and fput() dominating exit_mmap(). Fork performance is
> > important for Android because almost all processes are forked from
> > zygote, therefore this limitation already makes this approach
> > prohibitive.
>
> Interesting, naturally I wonder if that can be optimized.
Maybe but it looks like we simply do additional things for file-backed
memory, which seems natural. The call to i_mmap_lock_write() is from
here: https://elixir.bootlin.com/linux/latest/source/kernel/fork.c#L565
>
> >
> > 2. mremap() usage to grow the mapping has an issue when used with memfds:
> >
> > fd = memfd_create(name, MFD_ALLOW_SEALING);
> > ftruncate(fd, size_bytes);
> > ptr = mmap(NULL, size_bytes, prot, MAP_PRIVATE, fd, 0);
> > close(fd);
> > ptr = mremap(ptr, size_bytes, size_bytes * 2, MREMAP_MAYMOVE);
> > touch_mem(ptr, size_bytes * 2);
> >
> > This would generate a SIGBUS in touch_mem(). I believe it's because
> > ftruncate() specified the size to be size_bytes and we are accessing
> > more than that after remapping. prctl() does not have this limitation
> > and we do have a usecase for growing a named VMA.
>
> Can't you simply size the memfd much larger? I mean, it doesn't really
> cost much, does it?
If we know beforehand what the max size it can reach then that would
be possible. I would really hate to miscalculate here and cause a
simple memory access to generate signals. Tracking such corner cases
in the field is not an easy task and I would rather avoid the
possibility of it.
>
> >
> > 3. Leaves an fd exposed, even briefly, which may lead to unexpected
> > flaws (e.g. anything using mmap MAP_SHARED could allow exposures or
> > overwrites). Even MAP_PRIVATE, if an attacker writes into the file
> > after ftruncate() and before mmap(), can cause private memory to be
> > initialized with unexpected data.
>
> I don't quite follow. Can you elaborate what exactly the issue here is?
> We use a temporary fd, yes, but how is that a problem?
>
> Any attacker can just write any random memory memory in the address
> space, so I don't see the issue.
It feels to me that introducing another handle to the memory region is
a potential attack vector but I'm not a security expert. Maybe Kees
can assess this better?
>
> >
> > 4. There is a usecase in the Android userspace where vma naming
> > happens after memory was allocated. Bionic linker does in-memory
> > relocations and then names some relocated sections.
>
> Would renaming a memfd be an option or is that "too late" ?
My understanding is that linker allocates space to load and relocate
the code, performs the relocations in that space and then names some
of the regions after that. Whether it can be redesigned to allocate
multiple named regions and perform the relocation between them I did
not really try since it would be a project by itself.
TBH, at some point I just look at the amount of required changes (both
kernel and userspace) and new limitations that userspace has to adhere
to for fitting memfds to my usecase, and I feel that it's just not
worth it. In the end we end up using the same refcounted strings with
vma->vm_file->f_count as the refcount and name stored in
vma->vm_file->f_path->dentry but with more overhead.
Thanks,
Suren.
>
> >
> > In the light of these findings, could the current patchset be reconsidered?
> > Thanks,
> > Suren.
> >
>
>
> --
> Thanks,
>
> David / dhildenb
>
>>>
>>> 1. Forking a process with anonymous vmas named using memfd is 5-15%
>>> slower than with prctl (depends on the number of VMAs in the process
>>> being forked). Profiling shows that i_mmap_lock_write() dominates
>>> dup_mmap(). Exit path is also slower by roughly 9% with
>>> free_pgtables() and fput() dominating exit_mmap(). Fork performance is
>>> important for Android because almost all processes are forked from
>>> zygote, therefore this limitation already makes this approach
>>> prohibitive.
>>
>> Interesting, naturally I wonder if that can be optimized.
>
> Maybe but it looks like we simply do additional things for file-backed
> memory, which seems natural. The call to i_mmap_lock_write() is from
> here: https://elixir.bootlin.com/linux/latest/source/kernel/fork.c#L565
>
>>
>>>
>>> 2. mremap() usage to grow the mapping has an issue when used with memfds:
>>>
>>> fd = memfd_create(name, MFD_ALLOW_SEALING);
>>> ftruncate(fd, size_bytes);
>>> ptr = mmap(NULL, size_bytes, prot, MAP_PRIVATE, fd, 0);
>>> close(fd);
>>> ptr = mremap(ptr, size_bytes, size_bytes * 2, MREMAP_MAYMOVE);
>>> touch_mem(ptr, size_bytes * 2);
>>>
>>> This would generate a SIGBUS in touch_mem(). I believe it's because
>>> ftruncate() specified the size to be size_bytes and we are accessing
>>> more than that after remapping. prctl() does not have this limitation
>>> and we do have a usecase for growing a named VMA.
>>
>> Can't you simply size the memfd much larger? I mean, it doesn't really
>> cost much, does it?
>
> If we know beforehand what the max size it can reach then that would
> be possible. I would really hate to miscalculate here and cause a
> simple memory access to generate signals. Tracking such corner cases
> in the field is not an easy task and I would rather avoid the
> possibility of it.
The question would be if you cannot simply add some extremely large
number, because the file size itself doesn't really matter for memfd IIRC.
Having that said, without trying it out, I wouldn't know from the top of
my head if memremap would work that way on an already closed fd that ahs
a sufficient size :/ If you have the example still somewhere, I would be
interested if that would work in general.
[...]
>>
>>>
>>> 4. There is a usecase in the Android userspace where vma naming
>>> happens after memory was allocated. Bionic linker does in-memory
>>> relocations and then names some relocated sections.
>>
>> Would renaming a memfd be an option or is that "too late" ?
>
> My understanding is that linker allocates space to load and relocate
> the code, performs the relocations in that space and then names some
> of the regions after that. Whether it can be redesigned to allocate
> multiple named regions and perform the relocation between them I did
> not really try since it would be a project by itself.
>
> TBH, at some point I just look at the amount of required changes (both
> kernel and userspace) and new limitations that userspace has to adhere
> to for fitting memfds to my usecase, and I feel that it's just not
> worth it. In the end we end up using the same refcounted strings with
> vma->vm_file->f_count as the refcount and name stored in
> vma->vm_file->f_path->dentry but with more overhead.
Yes, but it's glued to files which naturally have names :)
Again, I appreciate that you looked into alternatives! I can see the
late renaming could be the biggest blocker if user space cannot be
adjusted easily to be compatible with that using memfds.
--
Thanks,
David / dhildenb
On Fri, Oct 15, 2021 at 09:30:09AM -0700, Suren Baghdasaryan wrote:
> On Fri, Oct 15, 2021 at 1:04 AM David Hildenbrand <[email protected]> wrote:
> >
> > On 14.10.21 22:16, Suren Baghdasaryan wrote:
> > > [...]
> > > 3. Leaves an fd exposed, even briefly, which may lead to unexpected
> > > flaws (e.g. anything using mmap MAP_SHARED could allow exposures or
> > > overwrites). Even MAP_PRIVATE, if an attacker writes into the file
> > > after ftruncate() and before mmap(), can cause private memory to be
> > > initialized with unexpected data.
> >
> > I don't quite follow. Can you elaborate what exactly the issue here is?
> > We use a temporary fd, yes, but how is that a problem?
> >
> > Any attacker can just write any random memory memory in the address
> > space, so I don't see the issue.
>
> It feels to me that introducing another handle to the memory region is
> a potential attack vector but I'm not a security expert. Maybe Kees
> can assess this better?
This case is kind of just an extension of "we don't need an fd, we need
a name". There is a lot of resulting baggage suddenly added to using
anonymous VMA (fork overhead to deal with the fds, etc), but for me, this
particular situation above is what really demonstrates the "unexpected
side-effects" of trying to swap an anonymous mmap for a memfd: there is
now an _external handle_ attached to memory that doesn't pass through
any of the existing security boundaries normally associated with process
memory (i.e. ptrace). Here's the example race:
victim process attacker process (same uid)
memfd_create(name, flags);
-> /proc/$pid/fd/3
ftruncate(3, size);
open("/proc/$victim/fd/3", O_RDWR)
-> 3
mmap(NULL, size,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_SHARED, 3, 0);
-> addr
memset(addr, 0xFF, size);
mmap(NULL, size, prot,
MAP_PRIVATE, 3, 0);
-> addr
close(3);
surprise, addr[0] != 0x00
And again, yes, we could program defensively, but it's a surprising
situation with new corner cases that haven't been present for years of
Just Using Anon VMAs. :) I would be worried about other vectors we
haven't imagined yet.
So, I think between both the overhead of files and the expanded attack
surface make memfd unsuited for this use-case.
-Kees
--
Kees Cook
On Fri, Oct 15, 2021 at 9:39 AM David Hildenbrand <[email protected]> wrote:
>
>
> >>>
> >>> 1. Forking a process with anonymous vmas named using memfd is 5-15%
> >>> slower than with prctl (depends on the number of VMAs in the process
> >>> being forked). Profiling shows that i_mmap_lock_write() dominates
> >>> dup_mmap(). Exit path is also slower by roughly 9% with
> >>> free_pgtables() and fput() dominating exit_mmap(). Fork performance is
> >>> important for Android because almost all processes are forked from
> >>> zygote, therefore this limitation already makes this approach
> >>> prohibitive.
> >>
> >> Interesting, naturally I wonder if that can be optimized.
> >
> > Maybe but it looks like we simply do additional things for file-backed
> > memory, which seems natural. The call to i_mmap_lock_write() is from
> > here: https://elixir.bootlin.com/linux/latest/source/kernel/fork.c#L565
> >
> >>
> >>>
> >>> 2. mremap() usage to grow the mapping has an issue when used with memfds:
> >>>
> >>> fd = memfd_create(name, MFD_ALLOW_SEALING);
> >>> ftruncate(fd, size_bytes);
> >>> ptr = mmap(NULL, size_bytes, prot, MAP_PRIVATE, fd, 0);
> >>> close(fd);
> >>> ptr = mremap(ptr, size_bytes, size_bytes * 2, MREMAP_MAYMOVE);
> >>> touch_mem(ptr, size_bytes * 2);
> >>>
> >>> This would generate a SIGBUS in touch_mem(). I believe it's because
> >>> ftruncate() specified the size to be size_bytes and we are accessing
> >>> more than that after remapping. prctl() does not have this limitation
> >>> and we do have a usecase for growing a named VMA.
> >>
> >> Can't you simply size the memfd much larger? I mean, it doesn't really
> >> cost much, does it?
> >
> > If we know beforehand what the max size it can reach then that would
> > be possible. I would really hate to miscalculate here and cause a
> > simple memory access to generate signals. Tracking such corner cases
> > in the field is not an easy task and I would rather avoid the
> > possibility of it.
>
> The question would be if you cannot simply add some extremely large
> number, because the file size itself doesn't really matter for memfd IIRC.
>
> Having that said, without trying it out, I wouldn't know from the top of
> my head if memremap would work that way on an already closed fd that ahs
> a sufficient size :/ If you have the example still somewhere, I would be
> interested if that would work in general.
Yes, I tried a simple test like this and it works:
fd = memfd_create(name, MFD_ALLOW_SEALING);
ftruncate(fd, size_bytes * 2);
ptr = mmap(NULL, size_bytes, prot, MAP_PRIVATE, fd, 0);
close(fd);
ptr = mremap(ptr, size_bytes, size_bytes * 2, MREMAP_MAYMOVE);
touch_mem(ptr, size_bytes * 2);
I understand your suggestion but it's just another hoop we have to
jump to make this work and feels unnatural from userspace POV. Also
virtual address space exhaustion might be an issue for 32bit userspace
with this approach.
>
> [...]
>
> >>
> >>>
> >>> 4. There is a usecase in the Android userspace where vma naming
> >>> happens after memory was allocated. Bionic linker does in-memory
> >>> relocations and then names some relocated sections.
> >>
> >> Would renaming a memfd be an option or is that "too late" ?
> >
> > My understanding is that linker allocates space to load and relocate
> > the code, performs the relocations in that space and then names some
> > of the regions after that. Whether it can be redesigned to allocate
> > multiple named regions and perform the relocation between them I did
> > not really try since it would be a project by itself.
> >
> > TBH, at some point I just look at the amount of required changes (both
> > kernel and userspace) and new limitations that userspace has to adhere
> > to for fitting memfds to my usecase, and I feel that it's just not
> > worth it. In the end we end up using the same refcounted strings with
> > vma->vm_file->f_count as the refcount and name stored in
> > vma->vm_file->f_path->dentry but with more overhead.
>
> Yes, but it's glued to files which naturally have names :)
Yeah, I understand your motivations and that's why I'm exploring these
possibilities but it proves to be just too costly for a feature as
simple as naming a vma :)
>
> Again, I appreciate that you looked into alternatives! I can see the
> late renaming could be the biggest blocker if user space cannot be
> adjusted easily to be compatible with that using memfds.
Yeah, it would definitely be hard for Android to adopt this.
If there are no objections to the current approach I would like to
respin another version with the CONFIG option added sometime early
next week. If anyone has objections, please let me know.
Thanks,
Suren.
>
> --
> Thanks,
>
> David / dhildenb
>