Make red-black tree allocation more flexible on a per-architecture
basis by introducing optional hooks to refine the red-black tree
structure and by exposing vmalloc functions for clipping, finding,
and inserting vmap areas.

With this patch, the red-black vmap tree can be refined to account for
architecture-specific memory management operations, most notably address
space layout randomization, as these features conflict with the generic
management of a single vmalloc_start to vmalloc_end range as given by
mm/vmalloc.c.

For example, x86 is forced to restrict ASLR to 1024 possible locations,
which is a very small number, and arm64 breaks standard code/data
partitioning altogether, which prevents the enforcement of performant
immutability on kernel page tables.
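
As an illustration of the intended use (the region bounds below are
placeholders, not values introduced by this patch), an architecture could
carve a reserved window out of the free tree at init time via the new hook
and the newly exported helpers:

	/*
	 * Illustrative sketch only; ARCH_RESERVED_START/END are placeholder
	 * bounds. The arch would also add
	 * "#define arch_refine_vmap_space arch_refine_vmap_space" in its
	 * asm/vmalloc.h to override the weak default.
	 */
	void arch_refine_vmap_space(struct rb_root *root,
				    struct list_head *head,
				    struct kmem_cache *cachep)
	{
		struct vmap_area *va;

		/* Clip the reserved window out of the free area containing it. */
		va = __find_vmap_area(ARCH_RESERVED_START, root);
		if (va)
			va_clip(root, head, va, ARCH_RESERVED_START,
				ARCH_RESERVED_END - ARCH_RESERVED_START);
	}
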
Signed-off-by: Maxwell Bland <[email protected]>
---
include/linux/vmalloc.h | 24 ++++++++++++++++++++++++
mm/vmalloc.c | 16 ++++++++++------
2 files changed, 34 insertions(+), 6 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 98ea90e90439..3c5ce7ee0bea 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -12,6 +12,7 @@
#include <asm/vmalloc.h>
+struct kmem_cache;
struct vm_area_struct; /* vma defining user mapping in mm_types.h */
struct notifier_block; /* in notifier.h */
struct iov_iter; /* in uio.h */
@@ -125,6 +126,21 @@ static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
}
#endif
+#ifndef arch_skip_va
+static inline bool arch_skip_va(struct vmap_area *va, unsigned long vstart)
+{
+ return false;
+}
+#endif
+
+#ifndef arch_refine_vmap_space
+static inline void arch_refine_vmap_space(struct rb_root *root,
+ struct list_head *head,
+ struct kmem_cache *cachep)
+{
+}
+#endif
+
/*
* Highlevel APIs for driver use
*/
@@ -214,6 +230,14 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
void free_vm_area(struct vm_struct *area);
extern struct vm_struct *remove_vm_area(const void *addr);
extern struct vm_struct *find_vm_area(const void *addr);
+extern void insert_vmap_area_augment(struct vmap_area *va, struct rb_node *from,
+ struct rb_root *root,
+ struct list_head *head);
+extern int va_clip(struct rb_root *root, struct list_head *head,
+ struct vmap_area *va, unsigned long nva_start_addr,
+ unsigned long size);
+extern struct vmap_area *__find_vmap_area(unsigned long addr,
+ struct rb_root *root);
struct vmap_area *find_vmap_area(unsigned long addr);
static inline bool is_vm_area_hugepages(const void *addr)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 68fa001648cc..de4577a3708e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -989,7 +989,7 @@ unsigned long vmalloc_nr_pages(void)
return atomic_long_read(&nr_vmalloc_pages);
}
-static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
+struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
struct rb_node *n = root->rb_node;
@@ -1322,7 +1322,7 @@ insert_vmap_area(struct vmap_area *va,
link_va(va, root, parent, link, head);
}
-static void
+void
insert_vmap_area_augment(struct vmap_area *va,
struct rb_node *from, struct rb_root *root,
struct list_head *head)
@@ -1501,7 +1501,7 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size,
vstart < va->va_start) {
node = node->rb_left;
} else {
- if (is_within_this_va(va, size, align, vstart))
+ if (!arch_skip_va(va, vstart) && is_within_this_va(va, size, align, vstart))
return va;
/*
@@ -1522,7 +1522,8 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size,
*/
while ((node = rb_parent(node))) {
va = rb_entry(node, struct vmap_area, rb_node);
- if (is_within_this_va(va, size, align, vstart))
+ if (!arch_skip_va(va, vstart) &&
+ is_within_this_va(va, size, align, vstart))
return va;
if (get_subtree_max_size(node->rb_right) >= length &&
@@ -1554,7 +1555,7 @@ find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
struct vmap_area *va;
list_for_each_entry(va, head, list) {
- if (!is_within_this_va(va, size, align, vstart))
+ if (arch_skip_va(va, vstart) || !is_within_this_va(va, size, align, vstart))
continue;
return va;
@@ -1617,7 +1618,7 @@ classify_va_fit_type(struct vmap_area *va,
return type;
}
-static __always_inline int
+__always_inline int
va_clip(struct rb_root *root, struct list_head *head,
struct vmap_area *va, unsigned long nva_start_addr,
unsigned long size)
@@ -5129,4 +5130,7 @@ void __init vmalloc_init(void)
vmap_node_shrinker->count_objects = vmap_node_shrink_count;
vmap_node_shrinker->scan_objects = vmap_node_shrink_scan;
shrinker_register(vmap_node_shrinker);
+
+ arch_refine_vmap_space(&free_vmap_area_root, &free_vmap_area_list,
+ vmap_area_cachep);
}
--
2.39.2
On Tue, Apr 02, 2024 at 03:15:01PM -0500, Maxwell Bland wrote:
> Make red-black tree allocation more flexible on a per-architecture
> basis by introducing optional hooks to refine the red-black tree
> structure and by exposing vmalloc functions for clipping, finding,
> and inserting vmap areas.
>
> With this patch, the red-black vmap tree can be refined to account for
> architecture-specific memory management operations, most notably address
> space layout randomization, as these features conflict with the generic
> management of a single vmalloc_start to vmalloc_end range as given by
> mm/vmalloc.c.
>
> For example, x86 is forced to restrict ASLR to 1024 possible locations,
> which is a very small number, and arm64 breaks standard code/data
> partitioning altogether, which prevents the enforcement of performant
> immutability on kernel page tables.
>
> Signed-off-by: Maxwell Bland <[email protected]>
> ---
> include/linux/vmalloc.h | 24 ++++++++++++++++++++++++
> mm/vmalloc.c | 16 ++++++++++------
> 2 files changed, 34 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 98ea90e90439..3c5ce7ee0bea 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -12,6 +12,7 @@
>
> #include <asm/vmalloc.h>
>
> +struct kmem_cache;
> struct vm_area_struct; /* vma defining user mapping in mm_types.h */
> struct notifier_block; /* in notifier.h */
> struct iov_iter; /* in uio.h */
> @@ -125,6 +126,21 @@ static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
> }
> #endif
>
> +#ifndef arch_skip_va
> +static inline bool arch_skip_va(struct vmap_area *va, unsigned long vstart)
> +{
> + return false;
> +}
> +#endif
> +
> +#ifndef arch_refine_vmap_space
> +static inline void arch_refine_vmap_space(struct rb_root *root,
> + struct list_head *head,
> + struct kmem_cache *cachep)
> +{
> +}
> +#endif
> +
> /*
> * Highlevel APIs for driver use
> */
> @@ -214,6 +230,14 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size,
> void free_vm_area(struct vm_struct *area);
> extern struct vm_struct *remove_vm_area(const void *addr);
> extern struct vm_struct *find_vm_area(const void *addr);
> +extern void insert_vmap_area_augment(struct vmap_area *va, struct rb_node *from,
> + struct rb_root *root,
> + struct list_head *head);
> +extern int va_clip(struct rb_root *root, struct list_head *head,
> + struct vmap_area *va, unsigned long nva_start_addr,
> + unsigned long size);
> +extern struct vmap_area *__find_vmap_area(unsigned long addr,
> + struct rb_root *root);
>
To me it looks like you want to make internal functions public for
everyone, which is not good, imho.
> struct vmap_area *find_vmap_area(unsigned long addr);
>
> static inline bool is_vm_area_hugepages(const void *addr)
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 68fa001648cc..de4577a3708e 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -989,7 +989,7 @@ unsigned long vmalloc_nr_pages(void)
> return atomic_long_read(&nr_vmalloc_pages);
> }
>
> -static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
> +struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
> {
> struct rb_node *n = root->rb_node;
>
> @@ -1322,7 +1322,7 @@ insert_vmap_area(struct vmap_area *va,
> link_va(va, root, parent, link, head);
> }
>
> -static void
> +void
> insert_vmap_area_augment(struct vmap_area *va,
> struct rb_node *from, struct rb_root *root,
> struct list_head *head)
> @@ -1501,7 +1501,7 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size,
> vstart < va->va_start) {
> node = node->rb_left;
> } else {
> - if (is_within_this_va(va, size, align, vstart))
> + if (!arch_skip_va(va, vstart) && is_within_this_va(va, size, align, vstart))
> return va;
>
> /*
> @@ -1522,7 +1522,8 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size,
> */
> while ((node = rb_parent(node))) {
> va = rb_entry(node, struct vmap_area, rb_node);
> - if (is_within_this_va(va, size, align, vstart))
> + if (!arch_skip_va(va, vstart) &&
> + is_within_this_va(va, size, align, vstart))
> return va;
>
> if (get_subtree_max_size(node->rb_right) >= length &&
> @@ -1554,7 +1555,7 @@ find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
> struct vmap_area *va;
>
> list_for_each_entry(va, head, list) {
> - if (!is_within_this_va(va, size, align, vstart))
> + if (arch_skip_va(va, vstart) || !is_within_this_va(va, size, align, vstart))
> continue;
>
arch_skip_va() injections into the search algorithm sound like a hack
and might lead (if I do not miss something, need to check closer) to alloc
failures when we go toward a reserved VA that we are not allowed to
allocate from.
> return va;
> @@ -1617,7 +1618,7 @@ classify_va_fit_type(struct vmap_area *va,
> return type;
> }
>
> -static __always_inline int
> +__always_inline int
> va_clip(struct rb_root *root, struct list_head *head,
> struct vmap_area *va, unsigned long nva_start_addr,
> unsigned long size)
> @@ -5129,4 +5130,7 @@ void __init vmalloc_init(void)
> vmap_node_shrinker->count_objects = vmap_node_shrink_count;
> vmap_node_shrinker->scan_objects = vmap_node_shrink_scan;
> shrinker_register(vmap_node_shrinker);
> +
> + arch_refine_vmap_space(&free_vmap_area_root, &free_vmap_area_list,
> + vmap_area_cachep);
> }
>
Why do you not just allocate using a specific range from MODULES_ASLR_START
till VMALLOC_END?
Thanks!
--
Uladzislau Rezki
On Thu, April 18, 2024 at 3:55 AM, Uladzislau Rezki wrote:
> On Tue, Apr 02, 2024 at 03:15:01PM -0500, Maxwell Bland wrote:
> > +extern void insert_vmap_area_augment(struct vmap_area *va, struct rb_node *from,
> > +extern int va_clip(struct rb_root *root, struct list_head *head,
> > +extern struct vmap_area *__find_vmap_area(unsigned long addr,
> To me it looks like you want to make internal functions public for
> everyone, which is not good, imho.
First, thank you for the feedback. I tussled with some of these ideas too while
writing. I will clarify some motivations below and then propose some
alternatives based upon your review.
> arch_skip_va() injections into the search algorithm sound like a hack and
> might lead (if I do not miss something, need to check closer) to alloc
> failures when we go toward a reserved VA that we are not allowed to
> allocate from.
This is a good insight into the architectural intention here. As is clear, the
underlying goal of this patch is to provide a method for architectures to
enforce their own pseudo-reserved vmalloc regions dynamically.
This considered, would the highlighted potential failures technically be
legitimate, with the caveat that architectures implementing the interface
become responsible for maintaining only correct and appropriate
reservations?
If so, the path diverges depending on whether we believe that caveat is
reasonable. I am on the fence about whether this freedom is a good thing,
so I think it is reasonable to disallow it; see below.
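
For concreteness, the sort of arch_skip_va() override I have in mind would
look roughly like the following (CODE_REGION_START/END are placeholders
here, not actual arm64 values):

	/*
	 * Illustrative only: skip free areas lying inside a reserved
	 * code-only window whenever the search start indicates a data
	 * allocation.
	 */
	static inline bool arch_skip_va(struct vmap_area *va, unsigned long vstart)
	{
		return vstart < CODE_REGION_START &&
		       va->va_start >= CODE_REGION_START &&
		       va->va_end <= CODE_REGION_END;
	}
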
> Why do you not just allocate using a specific range from MODULES_ASLR_START
> till VMALLOC_END?
Mark Rutland has indicated that he does not support a large reduction in
free region size in exchange for ensuring pages are not interleaved. That
is, strict partitioning was my initial approach, but it was deemed unfit
because it creates a trade-off between region size and ASLR randomization.
To clarify a secondary point, in case this question was more general:
allowing interleaving between the VMALLOC_START to VMALLOC_END and
MODULES_ASLR_START to MODULES_ASLR_END regions breaks a key use case,
namely being able to enforce new PMD-level, coarse-grained protections
(e.g. PXNTable) dynamically.
In case the question is more of a "why are you submitting this in the first
place": non-interleaving simplifies code focused on preventing malicious
page table updates, since we do not need to track every update of PTE-level
descriptors. Verifying individual PTE updates comes at a high cost in
performance and complexity, and happens to lead to hardware-level
privilege-checking race conditions on certain very popular arm64 chipsets.
OK, preamble out of the way:
(1) Would it be OK to export a more generic version of the functions
written in arch/arm64/kernel/vmalloc.c for
https://lore.kernel.org/all/[email protected]/
That is, to move a version of these functions into the main vmalloc.c?
This way these functions would still be owned by the right part of the
kernel.
Or (2) the exported functions could effectively be duplicated into
architecture-specific code, a sort of "all in" on the caveat mentioned
above, making architectures responsible for maintaining a reserved code
region if they choose to implement the interface.
(3) A different approach is also in mind: one that does not skip the
allocation of "bad" VAs but instead dynamically restructures the tree,
potentially by just maintaining two trees, one for data and one for code;
a rough sketch follows below.
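
Purely for illustration (is_code_alloc_range() is a placeholder predicate,
not an existing helper), that variant might look something like:

	/* Purely illustrative: a second free tree/list dedicated to code. */
	static struct rb_root free_code_vmap_area_root = RB_ROOT;
	static LIST_HEAD(free_code_vmap_area_list);

	/* Allocation paths would pick the (root, list) pair by intended use. */
	static struct rb_root *free_root_for(unsigned long vstart)
	{
		return is_code_alloc_range(vstart) ? &free_code_vmap_area_root
						   : &free_vmap_area_root;
	}
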
Thanks and Regards,
Maxwell Bland