2023-04-11 15:14:19

by Liam R. Howlett

Subject: [PATCH 6.1 00/14] Backport of maple tree fixes for 6.1/6.2

Greg,

Here are the backports of the maple tree fixes from 6.3-rc6 for 6.1
and 6.2.

The patches differ from the upstream series by a few extra maple tree
patches, and by changes to the mm patch so that it works without the
vma iterator change.

Liam R. Howlett (14):
maple_tree: remove GFP_ZERO from kmem_cache_alloc() and
kmem_cache_alloc_bulk()
maple_tree: fix potential rcu issue
maple_tree: reduce user error potential
maple_tree: fix handle of invalidated state in mas_wr_store_setup()
maple_tree: fix mas_prev() and mas_find() state handling
maple_tree: fix mas_skip_node() end slot detection
maple_tree: be more cautious about dead nodes
maple_tree: refine ma_state init from mas_start()
maple_tree: detect dead nodes in mas_start()
maple_tree: fix freeing of nodes in rcu mode
maple_tree: remove extra smp_wmb() from mas_dead_leaves()
maple_tree: add smp_rmb() to dead node detection
maple_tree: add RCU lock checking to rcu callback functions
mm: enable maple tree RCU mode by default.

include/linux/mm_types.h | 3 +-
kernel/fork.c | 3 +
lib/maple_tree.c | 389 ++++++++++++++++++++-----------
mm/mmap.c | 3 +-
tools/testing/radix-tree/maple.c | 18 +-
5 files changed, 263 insertions(+), 153 deletions(-)

--
2.39.2


2023-04-11 15:14:21

by Liam R. Howlett

Subject: [PATCH 6.1 10/14] maple_tree: fix freeing of nodes in rcu mode

From: "Liam R. Howlett" <[email protected]>

commit 2e5b4921f8efc9e845f4f04741797d16f36847eb upstream.

The walk to destroy the nodes was not always setting the node type,
which could result in the destroy method treating stored values as
nodes. Avoid this by setting the correct node types. This is
necessary for the RCU mode of the maple tree.
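
As a sketch of the invariant this patch enforces (not part of the
patch itself; the helpers are the ones visible in the diff below), a
dead node's type must be stamped from its encoded pointer before the
destroy walk consumes it:

	/* stamp the type so ma_is_leaf()/mt_slots[] see valid values */
	node = mte_to_node(enode);
	node->type = mte_node_type(enode);
	node->slot_len = mas_dead_leaves(&mas, slots, node->type);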

Link: https://lkml.kernel.org/r/[email protected]
Cc: <[email protected]>
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam Howlett <[email protected]>
---
lib/maple_tree.c | 73 ++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 62 insertions(+), 11 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 6fcf08dbdbf9..0f0a2d4850e8 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -892,6 +892,44 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt,
meta->end = end;
}

+/*
+ * mas_clear_meta() - clear the metadata information of a node, if it exists
+ * @mas: The maple state
+ * @mn: The maple node
+ * @mt: The maple node type
+ * @offset: The offset of the highest sub-gap in this node.
+ * @end: The end of the data in this node.
+ */
+static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn,
+ enum maple_type mt)
+{
+ struct maple_metadata *meta;
+ unsigned long *pivots;
+ void __rcu **slots;
+ void *next;
+
+ switch (mt) {
+ case maple_range_64:
+ pivots = mn->mr64.pivot;
+ if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) {
+ slots = mn->mr64.slot;
+ next = mas_slot_locked(mas, slots,
+ MAPLE_RANGE64_SLOTS - 1);
+ if (unlikely((mte_to_node(next) && mte_node_type(next))))
+ return; /* The last slot is a node, no metadata */
+ }
+ fallthrough;
+ case maple_arange_64:
+ meta = ma_meta(mn, mt);
+ break;
+ default:
+ return;
+ }
+
+ meta->gap = 0;
+ meta->end = 0;
+}
+
/*
* ma_meta_end() - Get the data end of a node from the metadata
* @mn: The maple node
@@ -5439,20 +5477,22 @@ static inline int mas_rev_alloc(struct ma_state *mas, unsigned long min,
* mas_dead_leaves() - Mark all leaves of a node as dead.
* @mas: The maple state
* @slots: Pointer to the slot array
+ * @type: The maple node type
*
* Must hold the write lock.
*
* Return: The number of leaves marked as dead.
*/
static inline
-unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots)
+unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots,
+ enum maple_type mt)
{
struct maple_node *node;
enum maple_type type;
void *entry;
int offset;

- for (offset = 0; offset < mt_slot_count(mas->node); offset++) {
+ for (offset = 0; offset < mt_slots[mt]; offset++) {
entry = mas_slot_locked(mas, slots, offset);
type = mte_node_type(entry);
node = mte_to_node(entry);
@@ -5471,14 +5511,13 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots)

static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset)
{
- struct maple_node *node, *next;
+ struct maple_node *next;
void __rcu **slots = NULL;

next = mas_mn(mas);
do {
- mas->node = ma_enode_ptr(next);
- node = mas_mn(mas);
- slots = ma_slots(node, node->type);
+ mas->node = mt_mk_node(next, next->type);
+ slots = ma_slots(next, next->type);
next = mas_slot_locked(mas, slots, offset);
offset = 0;
} while (!ma_is_leaf(next->type));
@@ -5542,11 +5581,14 @@ static inline void __rcu **mas_destroy_descend(struct ma_state *mas,
node = mas_mn(mas);
slots = ma_slots(node, mte_node_type(mas->node));
next = mas_slot_locked(mas, slots, 0);
- if ((mte_dead_node(next)))
+ if ((mte_dead_node(next))) {
+ mte_to_node(next)->type = mte_node_type(next);
next = mas_slot_locked(mas, slots, 1);
+ }

mte_set_node_dead(mas->node);
node->type = mte_node_type(mas->node);
+ mas_clear_meta(mas, node, node->type);
node->piv_parent = prev;
node->parent_slot = offset;
offset = 0;
@@ -5566,13 +5608,18 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,

MA_STATE(mas, &mt, 0, 0);

- if (mte_is_leaf(enode))
+ mas.node = enode;
+ if (mte_is_leaf(enode)) {
+ node->type = mte_node_type(enode);
goto free_leaf;
+ }

+ ma_flags &= ~MT_FLAGS_LOCK_MASK;
mt_init_flags(&mt, ma_flags);
mas_lock(&mas);

- mas.node = start = enode;
+ mte_to_node(enode)->ma_flags = ma_flags;
+ start = enode;
slots = mas_destroy_descend(&mas, start, 0);
node = mas_mn(&mas);
do {
@@ -5580,7 +5627,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
unsigned char offset;
struct maple_enode *parent, *tmp;

- node->slot_len = mas_dead_leaves(&mas, slots);
+ node->type = mte_node_type(mas.node);
+ node->slot_len = mas_dead_leaves(&mas, slots, node->type);
if (free)
mt_free_bulk(node->slot_len, slots);
offset = node->parent_slot + 1;
@@ -5604,7 +5652,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
} while (start != mas.node);

node = mas_mn(&mas);
- node->slot_len = mas_dead_leaves(&mas, slots);
+ node->type = mte_node_type(mas.node);
+ node->slot_len = mas_dead_leaves(&mas, slots, node->type);
if (free)
mt_free_bulk(node->slot_len, slots);

@@ -5614,6 +5663,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
free_leaf:
if (free)
mt_free_rcu(&node->rcu);
+ else
+ mas_clear_meta(&mas, node, node->type);
}

/*
--
2.39.2

2023-04-11 15:14:25

by Liam R. Howlett

Subject: [PATCH 6.1 05/14] maple_tree: fix mas_prev() and mas_find() state handling

From: "Liam R. Howlett" <[email protected]>

commit 17dc622c7b0f94e49bed030726df4db12ecaa6b5 upstream.

When mas_prev() does not find anything, set the state to MAS_NONE.

Handle the MAS_NONE in mas_find() like a MAS_START.
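
A minimal sketch of the affected iteration pattern (illustrative only;
the calls below are the maple tree API used throughout this series):

	MA_STATE(mas, &tree, index, index);

	entry = mas_prev(&mas, 0);
	if (!entry) {
		/* Nothing found: mas.node is now MAS_NONE, and a later
		 * mas_find() treats MAS_NONE like MAS_START, so the
		 * search restarts instead of misbehaving.
		 */
		entry = mas_find(&mas, ULONG_MAX);
	}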

Link: https://lkml.kernel.org/r/[email protected]
Cc: <[email protected]>
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam R. Howlett <[email protected]>
Reported-by: <[email protected]>
---
lib/maple_tree.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 50604fecd476..fc3e22cff642 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4850,7 +4850,7 @@ static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min)

if (mas->index < min) {
mas->index = mas->last = min;
- mas_pause(mas);
+ mas->node = MAS_NONE;
return NULL;
}
retry:
@@ -5926,6 +5926,7 @@ void *mas_prev(struct ma_state *mas, unsigned long min)
if (!mas->index) {
/* Nothing comes before 0 */
mas->last = 0;
+ mas->node = MAS_NONE;
return NULL;
}

@@ -6016,6 +6017,9 @@ void *mas_find(struct ma_state *mas, unsigned long max)
mas->index = ++mas->last;
}

+ if (unlikely(mas_is_none(mas)))
+ mas->node = MAS_START;
+
if (unlikely(mas_is_start(mas))) {
/* First run or continue */
void *entry;
--
2.39.2

2023-04-11 15:14:36

by Liam R. Howlett

Subject: [PATCH 6.1 11/14] maple_tree: remove extra smp_wmb() from mas_dead_leaves()

From: "Liam R. Howlett" <[email protected]>

commit 8372f4d83f96f35915106093cde4565836587123 upstream.

The call to mte_set_node_dead() that precedes this smp_wmb() already
issues an smp_wmb(), so the extra barrier is not needed. This is an
optimization for the RCU mode of the maple tree.
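
For reference, a paraphrased sketch of mte_set_node_dead() as it
exists in this tree (an assumption: the function body shown here is
not part of this diff):

	static inline void mte_set_node_dead(struct maple_enode *mn)
	{
		/* self-parenting marks the node dead */
		mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn));
		smp_wmb(); /* needed for RCU */
	}

Since marking the node dead already publishes with smp_wmb(), the
second barrier in mas_dead_leaves() was redundant.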

Link: https://lkml.kernel.org/r/[email protected]
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Cc: [email protected]
Signed-off-by: Liam Howlett <[email protected]>
---
lib/maple_tree.c | 1 -
1 file changed, 1 deletion(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 0f0a2d4850e8..281be0997e55 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -5501,7 +5501,6 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots,
break;

mte_set_node_dead(entry);
- smp_wmb(); /* Needed for RCU */
node->type = type;
rcu_assign_pointer(slots[offset], node);
}
--
2.39.2

2023-04-11 15:14:47

by Liam R. Howlett

Subject: [PATCH 6.1 13/14] maple_tree: add RCU lock checking to rcu callback functions

From: "Liam R. Howlett" <[email protected]>

commit 790e1fa86b340c2bd4a327e01c161f7a1ad885f6 upstream.

Dereferencing RCU objects within the RCU callback without the RCU check
has caused lockdep to complain. Fix the RCU dereferencing by using the
RCU callback lock to ensure the operation is safe.

Also stop creating a new lock to use for dereferencing during destruction
of the tree or subtree. Instead, pass through a pointer to the tree that
has the lock that is held for RCU dereferencing checking. It also does
not make sense to use the maple state in the freeing scenario as the tree
walk is a special case where the tree no longer has the normal encodings
and parent pointers.
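
The lockdep-checked dereference that replaces the fake maple state is
the pattern below, taken from the diff (rcu_callback_map is the lock
class lockdep treats as held inside an RCU callback):

	next = rcu_dereference_protected(slots[offset],
					 lock_is_held(&rcu_callback_map));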

Link: https://lkml.kernel.org/r/[email protected]
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Cc: [email protected]
Reported-by: Suren Baghdasaryan <[email protected]>
Signed-off-by: Liam R. Howlett <[email protected]>
---
lib/maple_tree.c | 188 ++++++++++++++++++++++++-----------------------
1 file changed, 96 insertions(+), 92 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 2f9af64edad9..b6e29081d2cc 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -814,6 +814,11 @@ static inline void *mt_slot(const struct maple_tree *mt,
return rcu_dereference_check(slots[offset], mt_locked(mt));
}

+static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots,
+ unsigned char offset)
+{
+ return rcu_dereference_protected(slots[offset], mt_locked(mt));
+}
/*
* mas_slot_locked() - Get the slot value when holding the maple tree lock.
* @mas: The maple state
@@ -825,7 +830,7 @@ static inline void *mt_slot(const struct maple_tree *mt,
static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots,
unsigned char offset)
{
- return rcu_dereference_protected(slots[offset], mt_locked(mas->tree));
+ return mt_slot_locked(mas->tree, slots, offset);
}

/*
@@ -897,34 +902,35 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt,
}

/*
- * mas_clear_meta() - clear the metadata information of a node, if it exists
- * @mas: The maple state
+ * mt_clear_meta() - clear the metadata information of a node, if it exists
+ * @mt: The maple tree
* @mn: The maple node
- * @mt: The maple node type
+ * @type: The maple node type
* @offset: The offset of the highest sub-gap in this node.
* @end: The end of the data in this node.
*/
-static inline void mas_clear_meta(struct ma_state *mas, struct maple_node *mn,
- enum maple_type mt)
+static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn,
+ enum maple_type type)
{
struct maple_metadata *meta;
unsigned long *pivots;
void __rcu **slots;
void *next;

- switch (mt) {
+ switch (type) {
case maple_range_64:
pivots = mn->mr64.pivot;
if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) {
slots = mn->mr64.slot;
- next = mas_slot_locked(mas, slots,
- MAPLE_RANGE64_SLOTS - 1);
- if (unlikely((mte_to_node(next) && mte_node_type(next))))
- return; /* The last slot is a node, no metadata */
+ next = mt_slot_locked(mt, slots,
+ MAPLE_RANGE64_SLOTS - 1);
+ if (unlikely((mte_to_node(next) &&
+ mte_node_type(next))))
+ return; /* no metadata, could be node */
}
fallthrough;
case maple_arange_64:
- meta = ma_meta(mn, mt);
+ meta = ma_meta(mn, type);
break;
default:
return;
@@ -5478,7 +5484,7 @@ static inline int mas_rev_alloc(struct ma_state *mas, unsigned long min,
}

/*
- * mas_dead_leaves() - Mark all leaves of a node as dead.
+ * mte_dead_leaves() - Mark all leaves of a node as dead.
* @mas: The maple state
* @slots: Pointer to the slot array
* @type: The maple node type
@@ -5488,16 +5494,16 @@ static inline int mas_rev_alloc(struct ma_state *mas, unsigned long min,
* Return: The number of leaves marked as dead.
*/
static inline
-unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots,
- enum maple_type mt)
+unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt,
+ void __rcu **slots)
{
struct maple_node *node;
enum maple_type type;
void *entry;
int offset;

- for (offset = 0; offset < mt_slots[mt]; offset++) {
- entry = mas_slot_locked(mas, slots, offset);
+ for (offset = 0; offset < mt_slot_count(enode); offset++) {
+ entry = mt_slot(mt, slots, offset);
type = mte_node_type(entry);
node = mte_to_node(entry);
/* Use both node and type to catch LE & BE metadata */
@@ -5512,162 +5518,160 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots,
return offset;
}

-static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset)
+/**
+ * mte_dead_walk() - Walk down a dead tree to just before the leaves
+ * @enode: The maple encoded node
+ * @offset: The starting offset
+ *
+ * Note: This can only be used from the RCU callback context.
+ */
+static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset)
{
- struct maple_node *next;
+ struct maple_node *node, *next;
void __rcu **slots = NULL;

- next = mas_mn(mas);
+ next = mte_to_node(*enode);
do {
- mas->node = mt_mk_node(next, next->type);
- slots = ma_slots(next, next->type);
- next = mas_slot_locked(mas, slots, offset);
+ *enode = ma_enode_ptr(next);
+ node = mte_to_node(*enode);
+ slots = ma_slots(node, node->type);
+ next = rcu_dereference_protected(slots[offset],
+ lock_is_held(&rcu_callback_map));
offset = 0;
} while (!ma_is_leaf(next->type));

return slots;
}

+/**
+ * mt_free_walk() - Walk & free a tree in the RCU callback context
+ * @head: The RCU head that's within the node.
+ *
+ * Note: This can only be used from the RCU callback context.
+ */
static void mt_free_walk(struct rcu_head *head)
{
void __rcu **slots;
struct maple_node *node, *start;
- struct maple_tree mt;
+ struct maple_enode *enode;
unsigned char offset;
enum maple_type type;
- MA_STATE(mas, &mt, 0, 0);

node = container_of(head, struct maple_node, rcu);

if (ma_is_leaf(node->type))
goto free_leaf;

- mt_init_flags(&mt, node->ma_flags);
- mas_lock(&mas);
start = node;
- mas.node = mt_mk_node(node, node->type);
- slots = mas_dead_walk(&mas, 0);
- node = mas_mn(&mas);
+ enode = mt_mk_node(node, node->type);
+ slots = mte_dead_walk(&enode, 0);
+ node = mte_to_node(enode);
do {
mt_free_bulk(node->slot_len, slots);
offset = node->parent_slot + 1;
- mas.node = node->piv_parent;
- if (mas_mn(&mas) == node)
- goto start_slots_free;
-
- type = mte_node_type(mas.node);
- slots = ma_slots(mte_to_node(mas.node), type);
- if ((offset < mt_slots[type]) && (slots[offset]))
- slots = mas_dead_walk(&mas, offset);
-
- node = mas_mn(&mas);
+ enode = node->piv_parent;
+ if (mte_to_node(enode) == node)
+ goto free_leaf;
+
+ type = mte_node_type(enode);
+ slots = ma_slots(mte_to_node(enode), type);
+ if ((offset < mt_slots[type]) &&
+ rcu_dereference_protected(slots[offset],
+ lock_is_held(&rcu_callback_map)))
+ slots = mte_dead_walk(&enode, offset);
+ node = mte_to_node(enode);
} while ((node != start) || (node->slot_len < offset));

slots = ma_slots(node, node->type);
mt_free_bulk(node->slot_len, slots);

-start_slots_free:
- mas_unlock(&mas);
free_leaf:
mt_free_rcu(&node->rcu);
}

-static inline void __rcu **mas_destroy_descend(struct ma_state *mas,
- struct maple_enode *prev, unsigned char offset)
+static inline void __rcu **mte_destroy_descend(struct maple_enode **enode,
+ struct maple_tree *mt, struct maple_enode *prev, unsigned char offset)
{
struct maple_node *node;
- struct maple_enode *next = mas->node;
+ struct maple_enode *next = *enode;
void __rcu **slots = NULL;
+ enum maple_type type;
+ unsigned char next_offset = 0;

do {
- mas->node = next;
- node = mas_mn(mas);
- slots = ma_slots(node, mte_node_type(mas->node));
- next = mas_slot_locked(mas, slots, 0);
- if ((mte_dead_node(next))) {
- mte_to_node(next)->type = mte_node_type(next);
- next = mas_slot_locked(mas, slots, 1);
- }
+ *enode = next;
+ node = mte_to_node(*enode);
+ type = mte_node_type(*enode);
+ slots = ma_slots(node, type);
+ next = mt_slot_locked(mt, slots, next_offset);
+ if ((mte_dead_node(next)))
+ next = mt_slot_locked(mt, slots, ++next_offset);

- mte_set_node_dead(mas->node);
- node->type = mte_node_type(mas->node);
- mas_clear_meta(mas, node, node->type);
+ mte_set_node_dead(*enode);
+ node->type = type;
node->piv_parent = prev;
node->parent_slot = offset;
- offset = 0;
- prev = mas->node;
+ offset = next_offset;
+ next_offset = 0;
+ prev = *enode;
} while (!mte_is_leaf(next));

return slots;
}

-static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
+static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
bool free)
{
void __rcu **slots;
struct maple_node *node = mte_to_node(enode);
struct maple_enode *start;
- struct maple_tree mt;
-
- MA_STATE(mas, &mt, 0, 0);

- mas.node = enode;
if (mte_is_leaf(enode)) {
node->type = mte_node_type(enode);
goto free_leaf;
}

- ma_flags &= ~MT_FLAGS_LOCK_MASK;
- mt_init_flags(&mt, ma_flags);
- mas_lock(&mas);
-
- mte_to_node(enode)->ma_flags = ma_flags;
start = enode;
- slots = mas_destroy_descend(&mas, start, 0);
- node = mas_mn(&mas);
+ slots = mte_destroy_descend(&enode, mt, start, 0);
+ node = mte_to_node(enode); // Updated in the above call.
do {
enum maple_type type;
unsigned char offset;
struct maple_enode *parent, *tmp;

- node->type = mte_node_type(mas.node);
- node->slot_len = mas_dead_leaves(&mas, slots, node->type);
+ node->slot_len = mte_dead_leaves(enode, mt, slots);
if (free)
mt_free_bulk(node->slot_len, slots);
offset = node->parent_slot + 1;
- mas.node = node->piv_parent;
- if (mas_mn(&mas) == node)
- goto start_slots_free;
+ enode = node->piv_parent;
+ if (mte_to_node(enode) == node)
+ goto free_leaf;

- type = mte_node_type(mas.node);
- slots = ma_slots(mte_to_node(mas.node), type);
+ type = mte_node_type(enode);
+ slots = ma_slots(mte_to_node(enode), type);
if (offset >= mt_slots[type])
goto next;

- tmp = mas_slot_locked(&mas, slots, offset);
+ tmp = mt_slot_locked(mt, slots, offset);
if (mte_node_type(tmp) && mte_to_node(tmp)) {
- parent = mas.node;
- mas.node = tmp;
- slots = mas_destroy_descend(&mas, parent, offset);
+ parent = enode;
+ enode = tmp;
+ slots = mte_destroy_descend(&enode, mt, parent, offset);
}
next:
- node = mas_mn(&mas);
- } while (start != mas.node);
+ node = mte_to_node(enode);
+ } while (start != enode);

- node = mas_mn(&mas);
- node->type = mte_node_type(mas.node);
- node->slot_len = mas_dead_leaves(&mas, slots, node->type);
+ node = mte_to_node(enode);
+ node->slot_len = mte_dead_leaves(enode, mt, slots);
if (free)
mt_free_bulk(node->slot_len, slots);

-start_slots_free:
- mas_unlock(&mas);
-
free_leaf:
if (free)
mt_free_rcu(&node->rcu);
else
- mas_clear_meta(&mas, node, node->type);
+ mt_clear_meta(mt, node, node->type);
}

/*
@@ -5683,10 +5687,10 @@ static inline void mte_destroy_walk(struct maple_enode *enode,
struct maple_node *node = mte_to_node(enode);

if (mt_in_rcu(mt)) {
- mt_destroy_walk(enode, mt->ma_flags, false);
+ mt_destroy_walk(enode, mt, false);
call_rcu(&node->rcu, mt_free_walk);
} else {
- mt_destroy_walk(enode, mt->ma_flags, true);
+ mt_destroy_walk(enode, mt, true);
}
}

--
2.39.2

2023-04-11 15:15:12

by Liam R. Howlett

Subject: [PATCH 6.1 14/14] mm: enable maple tree RCU mode by default.

From: "Liam R. Howlett" <[email protected]>

commit 3dd4432549415f3c65dd52d5c687629efbf4ece1 upstream.

Use the maple tree in RCU mode for VMA tracking.

The maple tree tracks the stack and is able to update the pivot
(lower/upper boundary) in-place to allow the page fault handler to write
to the tree while holding just the mmap read lock. This is safe
because writes to the stack have a guard VMA, which ensures there will
always be a NULL entry in the direction of the growth, so a write will
only update a pivot.

It is possible, but not recommended, to have VMAs that grow up/down
without guard VMAs. syzbot has constructed a testcase which sets up a
VMA to grow and consume the empty space. Overwriting the entire NULL
entry causes the tree to be altered in a way that is not safe for
concurrent readers; the readers may see a node being rewritten or one
that does not match the maple state they are using.

Enabling RCU mode allows concurrent readers to see a stable node and
return the expected result.
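
Condensed, the fork-path pattern this patch adds (see the
kernel/fork.c hunks below) drops RCU mode for the bulk VMA copy and
re-enables it only on success:

	mt_clear_in_rcu(mas.tree);	/* writer-only bulk copy */
	/* ... duplicate the VMAs under the mmap write lock ... */
	if (!retval)
		mt_set_in_rcu(mas.tree);	/* resume RCU-safe reads */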

Link: https://lkml.kernel.org/r/[email protected]
Cc: [email protected]
Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree")
Signed-off-by: Liam R. Howlett <[email protected]>
Reported-by: [email protected]
---
include/linux/mm_types.h | 3 ++-
kernel/fork.c | 3 +++
mm/mmap.c | 3 ++-
3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 500e536796ca..247aedb18d5c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -725,7 +725,8 @@ struct mm_struct {
unsigned long cpu_bitmap[];
};

-#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
+#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
+ MT_FLAGS_USE_RCU)
extern struct mm_struct init_mm;

/* Pointer magic because the dynamic array size confuses some compilers. */
diff --git a/kernel/fork.c b/kernel/fork.c
index a6d243a50be3..ec913b13c5ed 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto out;

+ mt_clear_in_rcu(mas.tree);
mas_for_each(&old_mas, mpnt, ULONG_MAX) {
struct file *file;

@@ -703,6 +704,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = arch_dup_mmap(oldmm, mm);
loop_out:
mas_destroy(&mas);
+ if (!retval)
+ mt_set_in_rcu(mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
diff --git a/mm/mmap.c b/mm/mmap.c
index 177714886849..fe1db604dc49 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2308,7 +2308,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
int count = 0;
int error = -ENOMEM;
MA_STATE(mas_detach, &mt_detach, 0, 0);
- mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+ mt_init_flags(&mt_detach, mas->tree->ma_flags & MT_FLAGS_LOCK_MASK);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);

if (mas_preallocate(mas, vma, GFP_KERNEL))
@@ -3095,6 +3095,7 @@ void exit_mmap(struct mm_struct *mm)
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
+ mt_clear_in_rcu(&mm->mm_mt);
free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
--
2.39.2

2023-04-11 15:15:28

by Liam R. Howlett

Subject: [PATCH 6.1 06/14] maple_tree: fix mas_skip_node() end slot detection

From: "Liam R. Howlett" <[email protected]>

commit 0fa99fdfe1b38da396d0b2d1496a823bcd0ebea0 upstream.

mas_skip_node() is used to move the maple state to the node with a
higher limit. It does this by walking up the tree and increasing the
slot count. Since the slot count cannot always be increased, it may
need to walk up multiple times before it can walk right to a node with
a higher limit. The slot limit being used was the node type's
capacity rather than the last location of data in the node. This
could shift the maple state outside the actual data, entering an error
state and thus returning -EBUSY.

As a result of the incorrect error state, mas_awalk() would return an
error instead of finding the allocation space.

The fix is to use mas_data_end() in mas_skip_node() to detect the
node's data end and continue walking up the tree until it is safe to
move to a node with a higher limit.

The walk up the tree also sets the maple state limits, so remove the
buggy code from mas_skip_node(). Setting the limits had the
unfortunate side effect of triggering another bug if the parent node
was full and there was no suitable gap in the second-to-last child,
but room in the next child.

mas_skip_node() may also be passed a maple state in an error state from
mas_anode_descend() when no allocations are available. Return on such an
error state immediately.
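
The distinction at the heart of the fix, as a sketch (both helpers
appear in the diff below): mt_slots[] gives a node type's slot
capacity, while mas_data_end() gives the last offset actually holding
data, and the skip must be bounded by the latter:

	/* wrong: may step past the data into empty slots */
	bound = mt_slots[mte_node_type(mas->node)] - 1;

	/* right: the last slot holding data in this node */
	bound = mas_data_end(mas);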

Link: https://lkml.kernel.org/r/[email protected]
Link: https://lkml.kernel.org/r/[email protected]
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Liam R. Howlett <[email protected]>
Reported-by: Snild Dolkow <[email protected]>
Link: https://lore.kernel.org/linux-mm/[email protected]/
Tested-by: Snild Dolkow <[email protected]>
Cc: Peng Zhang <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---
lib/maple_tree.c | 24 +++++-------------------
1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index fc3e22cff642..c50646fcb8ca 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -5097,35 +5097,21 @@ static inline bool mas_rewind_node(struct ma_state *mas)
*/
static inline bool mas_skip_node(struct ma_state *mas)
{
- unsigned char slot, slot_count;
- unsigned long *pivots;
- enum maple_type mt;
+ if (mas_is_err(mas))
+ return false;

- mt = mte_node_type(mas->node);
- slot_count = mt_slots[mt] - 1;
do {
if (mte_is_root(mas->node)) {
- slot = mas->offset;
- if (slot > slot_count) {
+ if (mas->offset >= mas_data_end(mas)) {
mas_set_err(mas, -EBUSY);
return false;
}
} else {
mas_ascend(mas);
- slot = mas->offset;
- mt = mte_node_type(mas->node);
- slot_count = mt_slots[mt] - 1;
}
- } while (slot > slot_count);
-
- mas->offset = ++slot;
- pivots = ma_pivots(mas_mn(mas), mt);
- if (slot > 0)
- mas->min = pivots[slot - 1] + 1;
-
- if (slot <= slot_count)
- mas->max = pivots[slot];
+ } while (mas->offset >= mas_data_end(mas));

+ mas->offset++;
return true;
}

--
2.39.2

2023-04-11 15:15:57

by Liam R. Howlett

Subject: [PATCH 6.1 12/14] maple_tree: add smp_rmb() to dead node detection

From: "Liam R. Howlett" <[email protected]>

commit 0a2b18d948838e16912b3b627b504ab062b7d02a upstream.

Add an smp_rmb() before reading the parent pointer to ensure that anything
read from the node prior to the parent pointer hasn't been reordered ahead
of this check.

This is necessary for the RCU mode of the maple tree.
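
Paired with the write side, the ordering looks like this (a conceptual
sketch; the writer's barrier comes from marking the node dead):

	/* writer: mark the node dead by self-parenting */
	node->parent = ma_parent_ptr(node);
	smp_wmb();	/* publish the dead mark */

	/* reader: ma_dead_node() after this patch */
	smp_rmb();	/* order earlier node reads before the check */
	parent = (void *)((unsigned long)node->parent & ~MAPLE_NODE_MASK);
	dead = (parent == node);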

Link: https://lkml.kernel.org/r/[email protected]
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Cc: [email protected]
Signed-off-by: Liam R. Howlett <[email protected]>
---
lib/maple_tree.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 281be0997e55..2f9af64edad9 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -529,9 +529,11 @@ static inline struct maple_node *mte_parent(const struct maple_enode *enode)
*/
static inline bool ma_dead_node(const struct maple_node *node)
{
- struct maple_node *parent = (void *)((unsigned long)
- node->parent & ~MAPLE_NODE_MASK);
+ struct maple_node *parent;

+ /* Do not reorder reads from the node prior to the parent check */
+ smp_rmb();
+ parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK);
return (parent == node);
}

@@ -546,6 +548,8 @@ static inline bool mte_dead_node(const struct maple_enode *enode)
struct maple_node *parent, *node;

node = mte_to_node(enode);
+ /* Do not reorder reads from the node prior to the parent check */
+ smp_rmb();
parent = mte_parent(enode);
return (parent == node);
}
--
2.39.2

2023-04-12 08:14:43

by Greg Kroah-Hartman

Subject: Re: [PATCH 6.1 00/14] Backport of maple tree fixes for 6.1/6.2

On Tue, Apr 11, 2023 at 11:10:41AM -0400, Liam R. Howlett wrote:
> Greg,
>
> Here are the backports of the maple tree fixes from 6.3-rc6 for 6.1
> and 6.2.
>
> The patches differ from the upstream series by a few extra maple tree
> patches, and by changes to the mm patch so that it works without the
> vma iterator change.
>
> Liam R. Howlett (14):
> maple_tree: remove GFP_ZERO from kmem_cache_alloc() and
> kmem_cache_alloc_bulk()
> maple_tree: fix potential rcu issue
> maple_tree: reduce user error potential
> maple_tree: fix handle of invalidated state in mas_wr_store_setup()
> maple_tree: fix mas_prev() and mas_find() state handling
> maple_tree: fix mas_skip_node() end slot detection
> maple_tree: be more cautious about dead nodes
> maple_tree: refine ma_state init from mas_start()
> maple_tree: detect dead nodes in mas_start()
> maple_tree: fix freeing of nodes in rcu mode
> maple_tree: remove extra smp_wmb() from mas_dead_leaves()
> maple_tree: add smp_rmb() to dead node detection
> maple_tree: add RCU lock checking to rcu callback functions
> mm: enable maple tree RCU mode by default.
>
> include/linux/mm_types.h | 3 +-
> kernel/fork.c | 3 +
> lib/maple_tree.c | 389 ++++++++++++++++++++-----------
> mm/mmap.c | 3 +-
> tools/testing/radix-tree/maple.c | 18 +-
> 5 files changed, 263 insertions(+), 153 deletions(-)
>
> --
> 2.39.2
>

Thanks for these. One of them was already in the 6.2.y and 6.1.y
releases, so I just skipped it.

greg k-h