This is another revision of the radix tree / workingset patches based
on feedback from Jan and Kirill. Thanks for your input.
This is a follow-up to d3798ae8c6f3 ("mm: filemap: don't plant shadow
entries without radix tree node"). That patch fixed an issue that was
caused mainly by the page cache sneaking special shadow page entries
into the radix tree and relying on subtleties in the radix tree code
to make that work. The fix also had to stop tracking refaults for
single-page files because shadow pages stored as direct pointers in
radix_tree_root->rnode weren't properly handled during tree extension.
These patches make the radix tree code explicitly support and track
such special entries, to eliminate the subtleties and to restore the
thrash detection for single-page files.
Changes since v2:
- Shadow entry accounting and radix tree node tracking are fully gone
from the page cache code, making it much simpler and more robust. Counts
are kept natively in the radix tree, node tracking is done from one
simple callback function in the workingset code. Thanks Jan.
- One more radix tree fix in khugepaged's new shmem collapsing code.
Thanks Kirill and Jan.
arch/s390/mm/gmap.c | 2 +-
drivers/sh/intc/virq.c | 2 +-
fs/dax.c | 10 +-
include/linux/radix-tree.h | 34 ++--
include/linux/swap.h | 34 +---
lib/radix-tree.c | 297 ++++++++++++++++++++------------
mm/filemap.c | 63 +------
mm/khugepaged.c | 16 +-
mm/migrate.c | 4 +-
mm/shmem.c | 9 +-
mm/truncate.c | 21 +--
mm/workingset.c | 70 ++++++--
tools/testing/radix-tree/multiorder.c | 2 +-
13 files changed, 292 insertions(+), 272 deletions(-)
When the shadow page shrinker tries to reclaim a radix tree node but
finds it in an unexpected state (it should contain no pages and a
non-zero number of shadow entries), there is no need to kill the
executing task
or even the entire system. Warn about the invalid state, then leave
that tree node be. Simply don't put it back on the shadow LRU for
future reclaim and move on.
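For illustration, a condensed sketch of the change in
shadow_lru_isolate() (the diff below is the authoritative version):

	if (WARN_ON_ONCE(workingset_node_pages(node)))  /* was: BUG_ON() */
		goto out_invalid;
	/* ... further checks and the actual slot clearing ... */
out_invalid:
	spin_unlock(&mapping->tree_lock);
	ret = LRU_REMOVED_RETRY;  /* node is not put back on the shadow LRU */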
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/workingset.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/mm/workingset.c b/mm/workingset.c
index 617475f529f4..3cfc61d84a52 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -418,23 +418,27 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
* no pages, so we expect to be able to remove them all and
* delete and free the empty node afterwards.
*/
- BUG_ON(!workingset_node_shadows(node));
- BUG_ON(workingset_node_pages(node));
-
+ if (WARN_ON_ONCE(!workingset_node_shadows(node)))
+ goto out_invalid;
+ if (WARN_ON_ONCE(workingset_node_pages(node)))
+ goto out_invalid;
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
if (node->slots[i]) {
- BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+ if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
+ goto out_invalid;
+ if (WARN_ON_ONCE(!mapping->nrexceptional))
+ goto out_invalid;
node->slots[i] = NULL;
workingset_node_shadows_dec(node);
- BUG_ON(!mapping->nrexceptional);
mapping->nrexceptional--;
}
}
- BUG_ON(workingset_node_shadows(node));
+ if (WARN_ON_ONCE(workingset_node_shadows(node)))
+ goto out_invalid;
inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
- if (!__radix_tree_delete_node(&mapping->page_tree, node))
- BUG();
+ __radix_tree_delete_node(&mapping->page_tree, node);
+out_invalid:
spin_unlock(&mapping->tree_lock);
ret = LRU_REMOVED_RETRY;
out:
--
2.10.2
When a radix tree iteration drops the tree lock, another thread might
swoop in and free the node holding the current slot. The iteration
needs to do another tree lookup from the current index to continue.
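The resulting pattern in collapse_shmem() looks roughly like this
(abridged sketch; the hunks below are authoritative):

	spin_unlock_irq(&mapping->tree_lock);
	/*
	 * Sleeping work on the page; another thread may free the
	 * node that backs @slot while the lock is dropped.
	 */
	spin_lock_irq(&mapping->tree_lock);

	/* Re-look up the slot from the current index before using it. */
	slot = radix_tree_lookup_slot(&mapping->page_tree, index);
	VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
				&mapping->tree_lock), page);
	/* ... use the slot ... */
	slot = radix_tree_iter_next(&iter);	/* then continue the walk */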
[[email protected]: re-lookup for replacement]
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/khugepaged.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 728d7790dc2d..bdfdab40a813 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1401,6 +1401,9 @@ static void collapse_shmem(struct mm_struct *mm,
spin_lock_irq(&mapping->tree_lock);
+ slot = radix_tree_lookup_slot(&mapping->page_tree, index);
+ VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
/*
@@ -1424,6 +1427,7 @@ static void collapse_shmem(struct mm_struct *mm,
radix_tree_replace_slot(slot,
new_page + (index % HPAGE_PMD_NR));
+ slot = radix_tree_iter_next(&iter);
index++;
continue;
out_lru:
@@ -1535,6 +1539,7 @@ static void collapse_shmem(struct mm_struct *mm,
putback_lru_page(page);
unlock_page(page);
spin_lock_irq(&mapping->tree_lock);
+ slot = radix_tree_iter_next(&iter);
}
VM_BUG_ON(nr_none);
spin_unlock_irq(&mapping->tree_lock);
--
2.10.2
The radix tree counts valid entries in each tree node. Entries stored
in the tree cannot be removed by simply storing NULL in the slot, or
the internal counters will be off and the node never gets freed again.
When collapsing a shmem page fails, restore the holes that were filled
with radix_tree_insert() with a proper radix tree deletion.
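To illustrate the difference (sketch only, using the names from the
hunk below):

	/* Wrong: clears the slot but leaves node->count stale, so the
	 * node can never be freed again: */
	radix_tree_replace_slot(slot, NULL);

	/* Right: a proper deletion that adjusts the node's counters and
	 * frees the node once it becomes empty: */
	radix_tree_delete(&mapping->page_tree, iter.index);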
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Reported-by: Jan Kara <[email protected]>
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/khugepaged.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index bdfdab40a813..d553c294de40 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1523,9 +1523,11 @@ static void collapse_shmem(struct mm_struct *mm,
if (!page || iter.index < page->index) {
if (!nr_none)
break;
- /* Put holes back where they were */
- radix_tree_replace_slot(slot, NULL);
nr_none--;
+ /* Put holes back where they were */
+ radix_tree_delete(&mapping->page_tree,
+ iter.index);
+ slot = radix_tree_iter_next(&iter);
continue;
}
--
2.10.2
The way the page cache is sneaking shadow entries of evicted pages
into the radix tree past the node entry accounting and tracking them
manually in the upper bits of node->count is fraught with problems.
These shadow entries are marked in the tree as exceptional entries,
which are a native concept to the radix tree. Maintain an explicit
counter of exceptional entries in the radix tree node. Subsequent
patches will switch shadow entry tracking over to that counter.
DAX and shmem are the other users of exceptional entries. Since slot
replacements that change the entry type from regular to exceptional
must now be accounted, introduce a __radix_tree_replace() function
that does replacement and accounting, and switch DAX and shmem over.
The increase in radix tree node size is temporary. A followup patch
switches the shadow tracking to this new scheme; at that point the
upper bits in node->count are no longer needed and the field can
shrink back to a single byte.
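The intended calling convention looks roughly like this (hypothetical
caller, tree lock held; it mirrors the DAX and shmem conversions
below):

	struct radix_tree_node *node;
	void **slot;
	void *old;

	/* Look up the slot together with its node so that
	 * node->exceptional can be adjusted by the replacement. */
	old = __radix_tree_lookup(&mapping->page_tree, index, &node, &slot);
	if (old)
		__radix_tree_replace(&mapping->page_tree, node, slot, new_entry);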
Signed-off-by: Johannes Weiner <[email protected]>
---
fs/dax.c | 5 +++--
include/linux/radix-tree.h | 10 +++++++---
lib/radix-tree.c | 46 +++++++++++++++++++++++++++++++++++++++++++---
mm/shmem.c | 8 ++++----
4 files changed, 57 insertions(+), 12 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 014defd2e744..db78bae0dc0f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -643,12 +643,13 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
}
mapping->nrexceptional++;
} else {
+ struct radix_tree_node *node;
void **slot;
void *ret;
- ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+ ret = __radix_tree_lookup(page_tree, index, &node, &slot);
WARN_ON_ONCE(ret != entry);
- radix_tree_replace_slot(slot, new_entry);
+ __radix_tree_replace(page_tree, node, slot, new_entry);
}
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index af3581b8a451..7ced8a70cc8b 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -85,9 +85,10 @@ static inline bool radix_tree_is_internal_node(void *ptr)
#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
struct radix_tree_node {
- unsigned char shift; /* Bits remaining in each slot */
- unsigned char offset; /* Slot offset in parent */
- unsigned int count;
+ unsigned char shift; /* Bits remaining in each slot */
+ unsigned char offset; /* Slot offset in parent */
+ unsigned int count; /* Total entry count */
+ unsigned char exceptional; /* Exceptional entry count */
union {
struct {
/* Used when ascending tree */
@@ -276,6 +277,9 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
struct radix_tree_node **nodep, void ***slotp);
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void __radix_tree_replace(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ void **slot, void *item);
bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 8e6d552c40dd..7885796d35ae 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
{
unsigned long i;
- pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
+ pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n",
node, node->offset,
node->tags[0][0], node->tags[1][0], node->tags[2][0],
- node->shift, node->count, node->parent);
+ node->shift, node->count, node->exceptional, node->parent);
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
unsigned long first = index | (i << node->shift);
@@ -522,8 +522,13 @@ static int radix_tree_extend(struct radix_tree_root *root,
node->offset = 0;
node->count = 1;
node->parent = NULL;
- if (radix_tree_is_internal_node(slot))
+ if (radix_tree_is_internal_node(slot)) {
entry_to_node(slot)->parent = node;
+ } else {
+ /* Moving an exceptional root->rnode to a node */
+ if (radix_tree_exceptional_entry(slot))
+ node->exceptional = 1;
+ }
node->slots[0] = slot;
slot = node_to_entry(node);
rcu_assign_pointer(root->rnode, slot);
@@ -649,6 +654,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
if (node) {
unsigned offset = get_slot_offset(node, slot);
node->count++;
+ if (radix_tree_exceptional_entry(item))
+ node->exceptional++;
BUG_ON(tag_get(node, 0, offset));
BUG_ON(tag_get(node, 1, offset));
BUG_ON(tag_get(node, 2, offset));
@@ -747,6 +754,37 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
EXPORT_SYMBOL(radix_tree_lookup);
/**
+ * __radix_tree_replace - replace item in a slot
+ * @root: radix tree root
+ * @node: pointer to tree node
+ * @slot: pointer to slot in @node
+ * @item: new item to store in the slot.
+ *
+ * For use with __radix_tree_lookup(). Caller must hold tree write locked
+ * across slot lookup and replacement.
+ */
+void __radix_tree_replace(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ void **slot, void *item)
+{
+ void *old = rcu_dereference_raw(*slot);
+ int exceptional;
+
+ WARN_ON_ONCE(radix_tree_is_internal_node(item));
+ WARN_ON_ONCE(!!item - !!old);
+
+ exceptional = !!radix_tree_exceptional_entry(item) -
+ !!radix_tree_exceptional_entry(old);
+
+ WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode);
+
+ if (node)
+ node->exceptional += exceptional;
+
+ rcu_assign_pointer(*slot, item);
+}
+
+/**
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
* @index: index key
@@ -1561,6 +1599,8 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
delete_sibling_entries(node, node_to_entry(slot), offset);
node->slots[offset] = NULL;
node->count--;
+ if (radix_tree_exceptional_entry(entry))
+ node->exceptional--;
__radix_tree_delete_node(root, node);
diff --git a/mm/shmem.c b/mm/shmem.c
index ad7813d73ea7..7f3a08df25c9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -300,18 +300,18 @@ void shmem_uncharge(struct inode *inode, long pages)
static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
+ struct radix_tree_node *node;
void **pslot;
void *item;
VM_BUG_ON(!expected);
VM_BUG_ON(!replacement);
- pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
- if (!pslot)
+ item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
+ if (!item)
return -ENOENT;
- item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
if (item != expected)
return -ENOENT;
- radix_tree_replace_slot(pslot, replacement);
+ __radix_tree_replace(&mapping->page_tree, node, pslot, replacement);
return 0;
}
--
2.10.2
The bug in khugepaged fixed earlier in this series shows that radix
tree slot replacement is fragile, and it will become more so once not
only NULL<->!NULL transitions need to be caught but also transitions
from and to exceptional entries. We need checks.
Re-implement radix_tree_replace_slot() on top of the sanity-checked
__radix_tree_replace(). This requires existing callers to also pass
the radix tree root, but it'll warn us when somebody replaces slots
with contents that need proper accounting (transitions between NULL
entries, real entries, exceptional entries) and where a replacement
through the slot pointer would corrupt the radix tree node counts.
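For callers the conversion is mostly mechanical (sketch; the variable
names are illustrative):

	/* Same-type replacement: only the root argument is new. */
	radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);

	/* A replacement that changes the entry type (for example an
	 * exceptional shadow entry installed over a regular page
	 * pointer) needs node accounting and must go through the
	 * node-aware variant: */
	entry = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
	__radix_tree_replace(&mapping->page_tree, node, pslot, shadow);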
Suggested-by: Jan Kara <[email protected]>
Signed-off-by: Johannes Weiner <[email protected]>
---
arch/s390/mm/gmap.c | 2 +-
drivers/sh/intc/virq.c | 2 +-
fs/dax.c | 4 +--
include/linux/radix-tree.h | 16 ++-------
lib/radix-tree.c | 63 +++++++++++++++++++++++++++--------
mm/filemap.c | 4 +--
mm/khugepaged.c | 5 +--
mm/migrate.c | 4 +--
mm/truncate.c | 2 +-
tools/testing/radix-tree/multiorder.c | 2 +-
10 files changed, 64 insertions(+), 40 deletions(-)
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 3ba622702ce4..ec1f0dedb948 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
if (slot) {
rmap->next = radix_tree_deref_slot_protected(slot,
&sg->guest_table_lock);
- radix_tree_replace_slot(slot, rmap);
+ radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
} else {
rmap->next = NULL;
radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c
index e7899624aa0b..35bbe288ddb4 100644
--- a/drivers/sh/intc/virq.c
+++ b/drivers/sh/intc/virq.c
@@ -254,7 +254,7 @@ static void __init intc_subgroup_map(struct intc_desc_int *d)
radix_tree_tag_clear(&d->tree, entry->enum_id,
INTC_TAG_VIRQ_NEEDS_ALLOC);
- radix_tree_replace_slot((void **)entries[i],
+ radix_tree_replace_slot(&d->tree, (void **)entries[i],
&intc_irq_xlate[irq]);
}
diff --git a/fs/dax.c b/fs/dax.c
index db78bae0dc0f..85930c2a2749 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
entry |= RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
return (void *)entry;
}
@@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
- radix_tree_replace_slot(slot, (void *)entry);
+ radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
return (void *)entry;
}
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 7ced8a70cc8b..2d1b9b8be983 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -249,20 +249,6 @@ static inline int radix_tree_exception(void *arg)
return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
}
-/**
- * radix_tree_replace_slot - replace item in a slot
- * @pslot: pointer to slot, returned by radix_tree_lookup_slot
- * @item: new item to store in the slot.
- *
- * For use with radix_tree_lookup_slot(). Caller must hold tree write locked
- * across slot lookup and replacement.
- */
-static inline void radix_tree_replace_slot(void **pslot, void *item)
-{
- BUG_ON(radix_tree_is_internal_node(item));
- rcu_assign_pointer(*pslot, item);
-}
-
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
unsigned order, struct radix_tree_node **nodep,
void ***slotp);
@@ -280,6 +266,8 @@ void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
void __radix_tree_replace(struct radix_tree_root *root,
struct radix_tree_node *node,
void **slot, void *item);
+void radix_tree_replace_slot(struct radix_tree_root *root,
+ void **slot, void *item);
bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 7885796d35ae..f91d5b0af654 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -753,19 +753,10 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_lookup);
-/**
- * __radix_tree_replace - replace item in a slot
- * @root: radix tree root
- * @node: pointer to tree node
- * @slot: pointer to slot in @node
- * @item: new item to store in the slot.
- *
- * For use with __radix_tree_lookup(). Caller must hold tree write locked
- * across slot lookup and replacement.
- */
-void __radix_tree_replace(struct radix_tree_root *root,
- struct radix_tree_node *node,
- void **slot, void *item)
+static void replace_slot(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ void **slot, void *item,
+ bool warn_typeswitch)
{
void *old = rcu_dereference_raw(*slot);
int exceptional;
@@ -776,7 +767,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
exceptional = !!radix_tree_exceptional_entry(item) -
!!radix_tree_exceptional_entry(old);
- WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode);
+ WARN_ON_ONCE(warn_typeswitch && exceptional);
if (node)
node->exceptional += exceptional;
@@ -785,6 +776,50 @@ void __radix_tree_replace(struct radix_tree_root *root,
}
/**
+ * __radix_tree_replace - replace item in a slot
+ * @root: radix tree root
+ * @node: pointer to tree node
+ * @slot: pointer to slot in @node
+ * @item: new item to store in the slot.
+ *
+ * For use with __radix_tree_lookup(). Caller must hold tree write locked
+ * across slot lookup and replacement.
+ */
+void __radix_tree_replace(struct radix_tree_root *root,
+ struct radix_tree_node *node,
+ void **slot, void *item)
+{
+ /*
+ * This function supports replacing exceptional entries, but
+ * that needs accounting against the node unless the slot is
+ * root->rnode.
+ */
+ replace_slot(root, node, slot, item,
+ !node && slot != (void **)&root->rnode);
+}
+
+/**
+ * radix_tree_replace_slot - replace item in a slot
+ * @root: radix tree root
+ * @slot: pointer to slot
+ * @item: new item to store in the slot.
+ *
+ * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(),
+ * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked
+ * across slot lookup and replacement.
+ *
+ * NOTE: This cannot be used to switch between non-entries (empty slots),
+ * regular entries, and exceptional entries, as that requires accounting
+ * inside the radix tree node. When switching from one type of entry to
+ * another, use __radix_tree_lookup() and __radix_tree_replace().
+ */
+void radix_tree_replace_slot(struct radix_tree_root *root,
+ void **slot, void *item)
+{
+ replace_slot(root, NULL, slot, item, true);
+}
+
+/**
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
* @index: index key
diff --git a/mm/filemap.c b/mm/filemap.c
index c7fe2f16503f..eb463156f29a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -147,7 +147,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
false);
}
}
- radix_tree_replace_slot(slot, page);
+ radix_tree_replace_slot(&mapping->page_tree, slot, page);
mapping->nrpages++;
if (node) {
workingset_node_pages_inc(node);
@@ -193,7 +193,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
shadow = NULL;
}
- radix_tree_replace_slot(slot, shadow);
+ radix_tree_replace_slot(&mapping->page_tree, slot, shadow);
if (!node)
break;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d553c294de40..c2f015bd57cb 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1424,7 +1424,7 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- radix_tree_replace_slot(slot,
+ radix_tree_replace_slot(&mapping->page_tree, slot,
new_page + (index % HPAGE_PMD_NR));
slot = radix_tree_iter_next(&iter);
@@ -1536,7 +1536,8 @@ static void collapse_shmem(struct mm_struct *mm,
/* Unfreeze the page. */
list_del(&page->lru);
page_ref_unfreeze(page, 2);
- radix_tree_replace_slot(slot, page);
+ radix_tree_replace_slot(&mapping->page_tree,
+ slot, page);
spin_unlock_irq(&mapping->tree_lock);
putback_lru_page(page);
unlock_page(page);
diff --git a/mm/migrate.c b/mm/migrate.c
index 99250aee1ac1..9b88e4e37d0a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
SetPageDirty(newpage);
}
- radix_tree_replace_slot(pslot, newpage);
+ radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
/*
* Drop cache reference from old page by unfreezing
@@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
get_page(newpage);
- radix_tree_replace_slot(pslot, newpage);
+ radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
page_ref_unfreeze(page, expected_count - 1);
diff --git a/mm/truncate.c b/mm/truncate.c
index a01cce450a26..6ae44571d4c7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -49,7 +49,7 @@ static void clear_exceptional_entry(struct address_space *mapping,
goto unlock;
if (*slot != entry)
goto unlock;
- radix_tree_replace_slot(slot, NULL);
+ radix_tree_replace_slot(&mapping->page_tree, slot, NULL);
mapping->nrexceptional--;
if (!node)
goto unlock;
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 05d7bc488971..d1be94667a30 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -146,7 +146,7 @@ static void multiorder_check(unsigned long index, int order)
slot = radix_tree_lookup_slot(&tree, index);
free(*slot);
- radix_tree_replace_slot(slot, item2);
+ radix_tree_replace_slot(&tree, slot, item2);
for (i = min; i < max; i++) {
struct item *item = item_lookup(&tree, i);
assert(item != 0);
--
2.10.2
Page cache shadow entry handling will be a lot simpler when it can use
a single generic replacement function for pages, shadow entries, and
emptying slots.
Make __radix_tree_replace() properly account insertions and deletions
in node->count and garbage collect nodes as they become empty. Then
re-implement radix_tree_delete() on top of it.
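With this in place, storing NULL through the node-aware replace is a
full deletion (sketch, with the tree lock held and names as in the
radix tree code):

	entry = __radix_tree_lookup(root, index, &node, &slot);
	/* Storing NULL decrements node->count and, via delete_node(),
	 * frees the node and shrinks the tree once it is empty. */
	__radix_tree_replace(root, node, slot, NULL);

which is essentially what radix_tree_delete_item() does now after
clearing the tags and sibling entries.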
Signed-off-by: Johannes Weiner <[email protected]>
---
lib/radix-tree.c | 227 ++++++++++++++++++++++++++++---------------------------
1 file changed, 116 insertions(+), 111 deletions(-)
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index f91d5b0af654..5d8930f3b3d8 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -539,6 +539,107 @@ static int radix_tree_extend(struct radix_tree_root *root,
}
/**
+ * radix_tree_shrink - shrink radix tree to minimum height
+ * @root radix tree root
+ */
+static inline bool radix_tree_shrink(struct radix_tree_root *root)
+{
+ bool shrunk = false;
+
+ for (;;) {
+ struct radix_tree_node *node = root->rnode;
+ struct radix_tree_node *child;
+
+ if (!radix_tree_is_internal_node(node))
+ break;
+ node = entry_to_node(node);
+
+ /*
+ * The candidate node has more than one child, or its child
+ * is not at the leftmost slot, or the child is a multiorder
+ * entry, we cannot shrink.
+ */
+ if (node->count != 1)
+ break;
+ child = node->slots[0];
+ if (!child)
+ break;
+ if (!radix_tree_is_internal_node(child) && node->shift)
+ break;
+
+ if (radix_tree_is_internal_node(child))
+ entry_to_node(child)->parent = NULL;
+
+ /*
+ * We don't need rcu_assign_pointer(), since we are simply
+ * moving the node from one part of the tree to another: if it
+ * was safe to dereference the old pointer to it
+ * (node->slots[0]), it will be safe to dereference the new
+ * one (root->rnode) as far as dependent read barriers go.
+ */
+ root->rnode = child;
+
+ /*
+ * We have a dilemma here. The node's slot[0] must not be
+ * NULLed in case there are concurrent lookups expecting to
+ * find the item. However if this was a bottom-level node,
+ * then it may be subject to the slot pointer being visible
+ * to callers dereferencing it. If item corresponding to
+ * slot[0] is subsequently deleted, these callers would expect
+ * their slot to become empty sooner or later.
+ *
+ * For example, lockless pagecache will look up a slot, deref
+ * the page pointer, and if the page has 0 refcount it means it
+ * was concurrently deleted from pagecache so try the deref
+ * again. Fortunately there is already a requirement for logic
+ * to retry the entire slot lookup -- the indirect pointer
+ * problem (replacing direct root node with an indirect pointer
+ * also results in a stale slot). So tag the slot as indirect
+ * to force callers to retry.
+ */
+ if (!radix_tree_is_internal_node(child))
+ node->slots[0] = RADIX_TREE_RETRY;
+
+ radix_tree_node_free(node);
+ shrunk = true;
+ }
+
+ return shrunk;
+}
+
+static bool delete_node(struct radix_tree_root *root,
+ struct radix_tree_node *node)
+{
+ bool deleted = false;
+
+ do {
+ struct radix_tree_node *parent;
+
+ if (node->count) {
+ if (node == entry_to_node(root->rnode))
+ deleted |= radix_tree_shrink(root);
+ return deleted;
+ }
+
+ parent = node->parent;
+ if (parent) {
+ parent->slots[node->offset] = NULL;
+ parent->count--;
+ } else {
+ root_tag_clear_all(root);
+ root->rnode = NULL;
+ }
+
+ radix_tree_node_free(node);
+ deleted = true;
+
+ node = parent;
+ } while (node);
+
+ return deleted;
+}
+
+/**
* __radix_tree_create - create a slot in a radix tree
* @root: radix tree root
* @index: index key
@@ -759,18 +860,20 @@ static void replace_slot(struct radix_tree_root *root,
bool warn_typeswitch)
{
void *old = rcu_dereference_raw(*slot);
- int exceptional;
+ int count, exceptional;
WARN_ON_ONCE(radix_tree_is_internal_node(item));
- WARN_ON_ONCE(!!item - !!old);
+ count = !!item - !!old;
exceptional = !!radix_tree_exceptional_entry(item) -
!!radix_tree_exceptional_entry(old);
- WARN_ON_ONCE(warn_typeswitch && exceptional);
+ WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
- if (node)
+ if (node) {
+ node->count += count;
node->exceptional += exceptional;
+ }
rcu_assign_pointer(*slot, item);
}
@@ -790,12 +893,14 @@ void __radix_tree_replace(struct radix_tree_root *root,
void **slot, void *item)
{
/*
- * This function supports replacing exceptional entries, but
- * that needs accounting against the node unless the slot is
- * root->rnode.
+ * This function supports replacing exceptional entries and
+ * deleting entries, but that needs accounting against the
+ * node unless the slot is root->rnode.
*/
replace_slot(root, node, slot, item,
!node && slot != (void **)&root->rnode);
+
+ delete_node(root, node);
}
/**
@@ -810,8 +915,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
*
* NOTE: This cannot be used to switch between non-entries (empty slots),
* regular entries, and exceptional entries, as that requires accounting
- * inside the radix tree node. When switching from one type of entry to
- * another, use __radix_tree_lookup() and __radix_tree_replace().
+ * inside the radix tree node. When switching from one type of entry or
+ * deleting, use __radix_tree_lookup() and __radix_tree_replace().
*/
void radix_tree_replace_slot(struct radix_tree_root *root,
void **slot, void *item)
@@ -1467,75 +1572,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
#endif /* CONFIG_SHMEM && CONFIG_SWAP */
/**
- * radix_tree_shrink - shrink radix tree to minimum height
- * @root radix tree root
- */
-static inline bool radix_tree_shrink(struct radix_tree_root *root)
-{
- bool shrunk = false;
-
- for (;;) {
- struct radix_tree_node *node = root->rnode;
- struct radix_tree_node *child;
-
- if (!radix_tree_is_internal_node(node))
- break;
- node = entry_to_node(node);
-
- /*
- * The candidate node has more than one child, or its child
- * is not at the leftmost slot, or the child is a multiorder
- * entry, we cannot shrink.
- */
- if (node->count != 1)
- break;
- child = node->slots[0];
- if (!child)
- break;
- if (!radix_tree_is_internal_node(child) && node->shift)
- break;
-
- if (radix_tree_is_internal_node(child))
- entry_to_node(child)->parent = NULL;
-
- /*
- * We don't need rcu_assign_pointer(), since we are simply
- * moving the node from one part of the tree to another: if it
- * was safe to dereference the old pointer to it
- * (node->slots[0]), it will be safe to dereference the new
- * one (root->rnode) as far as dependent read barriers go.
- */
- root->rnode = child;
-
- /*
- * We have a dilemma here. The node's slot[0] must not be
- * NULLed in case there are concurrent lookups expecting to
- * find the item. However if this was a bottom-level node,
- * then it may be subject to the slot pointer being visible
- * to callers dereferencing it. If item corresponding to
- * slot[0] is subsequently deleted, these callers would expect
- * their slot to become empty sooner or later.
- *
- * For example, lockless pagecache will look up a slot, deref
- * the page pointer, and if the page has 0 refcount it means it
- * was concurrently deleted from pagecache so try the deref
- * again. Fortunately there is already a requirement for logic
- * to retry the entire slot lookup -- the indirect pointer
- * problem (replacing direct root node with an indirect pointer
- * also results in a stale slot). So tag the slot as indirect
- * to force callers to retry.
- */
- if (!radix_tree_is_internal_node(child))
- node->slots[0] = RADIX_TREE_RETRY;
-
- radix_tree_node_free(node);
- shrunk = true;
- }
-
- return shrunk;
-}
-
-/**
* __radix_tree_delete_node - try to free node after clearing a slot
* @root: radix tree root
* @node: node containing @index
@@ -1549,33 +1585,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node)
{
- bool deleted = false;
-
- do {
- struct radix_tree_node *parent;
-
- if (node->count) {
- if (node == entry_to_node(root->rnode))
- deleted |= radix_tree_shrink(root);
- return deleted;
- }
-
- parent = node->parent;
- if (parent) {
- parent->slots[node->offset] = NULL;
- parent->count--;
- } else {
- root_tag_clear_all(root);
- root->rnode = NULL;
- }
-
- radix_tree_node_free(node);
- deleted = true;
-
- node = parent;
- } while (node);
-
- return deleted;
+ return delete_node(root, node);
}
static inline void delete_sibling_entries(struct radix_tree_node *node,
@@ -1632,12 +1642,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
node_tag_clear(root, node, tag, offset);
delete_sibling_entries(node, node_to_entry(slot), offset);
- node->slots[offset] = NULL;
- node->count--;
- if (radix_tree_exceptional_entry(entry))
- node->exceptional--;
-
- __radix_tree_delete_node(root, node);
+ __radix_tree_replace(root, node, slot, NULL);
return entry;
}
--
2.10.2
Support handing __radix_tree_replace() a callback that gets invoked
for all leaf nodes that change or get freed as a result of the slot
replacement, to assist users tracking nodes with node->private_list.
This prepares for putting page cache shadow entries into the radix
tree root again and drastically simplifying the shadow tracking.
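A sketch of the interface (the callback name and body are
hypothetical; the real user, workingset_update_node(), is added later
in the series):

	static void my_update_node(struct radix_tree_node *node, void *private)
	{
		/* Called under the tree lock for every leaf node whose
		 * contents changed; look at node->count and
		 * node->exceptional and maintain node->private_list
		 * tracking here.  @private is whatever the caller
		 * passed, e.g. the address_space. */
	}

	__radix_tree_replace(&mapping->page_tree, node, slot, item,
			     my_update_node, mapping);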
Suggested-by: Jan Kara <[email protected]>
Signed-off-by: Johannes Weiner <[email protected]>
---
fs/dax.c | 3 ++-
include/linux/radix-tree.h | 4 +++-
lib/radix-tree.c | 42 +++++++++++++++++++++++++++++-------------
mm/shmem.c | 3 ++-
4 files changed, 36 insertions(+), 16 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 85930c2a2749..6916ed37d463 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -649,7 +649,8 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
ret = __radix_tree_lookup(page_tree, index, &node, &slot);
WARN_ON_ONCE(ret != entry);
- __radix_tree_replace(page_tree, node, slot, new_entry);
+ __radix_tree_replace(page_tree, node, slot,
+ new_entry, NULL, NULL);
}
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 2d1b9b8be983..15c972ea9510 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -263,9 +263,11 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
struct radix_tree_node **nodep, void ***slotp);
void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
void __radix_tree_replace(struct radix_tree_root *root,
struct radix_tree_node *node,
- void **slot, void *item);
+ void **slot, void *item,
+ radix_tree_update_node_t update_node, void *private);
void radix_tree_replace_slot(struct radix_tree_root *root,
void **slot, void *item);
bool __radix_tree_delete_node(struct radix_tree_root *root,
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 5d8930f3b3d8..df4ff18dd63c 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
tag_clear(node, i, 0);
node->slots[0] = NULL;
- node->count = 0;
kmem_cache_free(radix_tree_node_cachep, node);
}
@@ -542,7 +541,9 @@ static int radix_tree_extend(struct radix_tree_root *root,
* radix_tree_shrink - shrink radix tree to minimum height
* @root radix tree root
*/
-static inline bool radix_tree_shrink(struct radix_tree_root *root)
+static inline bool radix_tree_shrink(struct radix_tree_root *root,
+ radix_tree_update_node_t update_node,
+ void *private)
{
bool shrunk = false;
@@ -597,8 +598,12 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
* also results in a stale slot). So tag the slot as indirect
* to force callers to retry.
*/
- if (!radix_tree_is_internal_node(child))
+ node->count = 0;
+ if (!radix_tree_is_internal_node(child)) {
node->slots[0] = RADIX_TREE_RETRY;
+ if (update_node)
+ update_node(node, private);
+ }
radix_tree_node_free(node);
shrunk = true;
@@ -608,7 +613,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
}
static bool delete_node(struct radix_tree_root *root,
- struct radix_tree_node *node)
+ struct radix_tree_node *node,
+ radix_tree_update_node_t update_node, void *private)
{
bool deleted = false;
@@ -617,7 +623,8 @@ static bool delete_node(struct radix_tree_root *root,
if (node->count) {
if (node == entry_to_node(root->rnode))
- deleted |= radix_tree_shrink(root);
+ deleted |= radix_tree_shrink(root, update_node,
+ private);
return deleted;
}
@@ -880,17 +887,20 @@ static void replace_slot(struct radix_tree_root *root,
/**
* __radix_tree_replace - replace item in a slot
- * @root: radix tree root
- * @node: pointer to tree node
- * @slot: pointer to slot in @node
- * @item: new item to store in the slot.
+ * @root: radix tree root
+ * @node: pointer to tree node
+ * @slot: pointer to slot in @node
+ * @item: new item to store in the slot.
+ * @update_node: callback for changing leaf nodes
+ * @private: private data to pass to @update_node
*
* For use with __radix_tree_lookup(). Caller must hold tree write locked
* across slot lookup and replacement.
*/
void __radix_tree_replace(struct radix_tree_root *root,
struct radix_tree_node *node,
- void **slot, void *item)
+ void **slot, void *item,
+ radix_tree_update_node_t update_node, void *private)
{
/*
* This function supports replacing exceptional entries and
@@ -900,7 +910,13 @@ void __radix_tree_replace(struct radix_tree_root *root,
replace_slot(root, node, slot, item,
!node && slot != (void **)&root->rnode);
- delete_node(root, node);
+ if (!node)
+ return;
+
+ if (update_node)
+ update_node(node, private);
+
+ delete_node(root, node, update_node, private);
}
/**
@@ -1585,7 +1601,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
bool __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node)
{
- return delete_node(root, node);
+ return delete_node(root, node, NULL, NULL);
}
static inline void delete_sibling_entries(struct radix_tree_node *node,
@@ -1642,7 +1658,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
node_tag_clear(root, node, tag, offset);
delete_sibling_entries(node, node_to_entry(slot), offset);
- __radix_tree_replace(root, node, slot, NULL);
+ __radix_tree_replace(root, node, slot, NULL, NULL, NULL);
return entry;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 7f3a08df25c9..62ac381069fc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -311,7 +311,8 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
return -ENOENT;
if (item != expected)
return -ENOENT;
- __radix_tree_replace(&mapping->page_tree, node, pslot, replacement);
+ __radix_tree_replace(&mapping->page_tree, node, pslot,
+ replacement, NULL, NULL);
return 0;
}
--
2.10.2
Currently, we track the shadow entries in the page cache in the upper
bits of the radix_tree_node->count, behind the back of the radix tree
implementation. Because the radix tree code has no awareness of them,
we rely on random subtleties throughout the implementation (such as
the node->count != 1 check in the shrinking code, which is meant to
exclude multi-entry nodes but also happens to skip nodes with only one
shadow entry, as that's accounted in the upper bits). This is error
prone and has, in fact, caused the bug fixed in d3798ae8c6f3 ("mm:
filemap: don't plant shadow entries without radix tree node").
To remove these subtleties, this patch moves shadow entry tracking
from the upper bits of node->count to the existing counter for
exceptional entries. node->count goes back to being a simple counter
of valid entries in the tree node and can be shrunk to a single byte.
This vastly simplifies the page cache code. All accounting happens
natively inside the radix tree implementation, and maintaining the LRU
linkage of shadow nodes is consolidated into a single function in the
workingset code that is called for leaf nodes affected by a change in
the page cache tree.
This also removes the last user of the __radix_tree_delete_node()
return value. Eliminate it.
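With the native counters, the test for a node that holds nothing but
shadow entries becomes trivial (hypothetical helper for illustration;
the actual check sits inline in workingset_update_node() below):

	static bool node_holds_only_shadows(struct radix_tree_node *node)
	{
		/* non-empty, and every entry is an exceptional (shadow) entry */
		return node->count && node->count == node->exceptional;
	}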
Signed-off-by: Johannes Weiner <[email protected]>
---
include/linux/radix-tree.h | 8 ++-----
include/linux/swap.h | 34 +---------------------------
lib/radix-tree.c | 25 +++++----------------
mm/filemap.c | 54 +++++---------------------------------------
mm/truncate.c | 21 +++--------------
mm/workingset.c | 56 +++++++++++++++++++++++++++++++++++-----------
6 files changed, 60 insertions(+), 138 deletions(-)
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 15c972ea9510..744486057e9e 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -80,14 +80,10 @@ static inline bool radix_tree_is_internal_node(void *ptr)
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
RADIX_TREE_MAP_SHIFT))
-/* Internally used bits of node->count */
-#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
-#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
-
struct radix_tree_node {
unsigned char shift; /* Bits remaining in each slot */
unsigned char offset; /* Slot offset in parent */
- unsigned int count; /* Total entry count */
+ unsigned char count; /* Total entry count */
unsigned char exceptional; /* Exceptional entry count */
union {
struct {
@@ -270,7 +266,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
radix_tree_update_node_t update_node, void *private);
void radix_tree_replace_slot(struct radix_tree_root *root,
void **slot, void *item);
-bool __radix_tree_delete_node(struct radix_tree_root *root,
+void __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a56523cefb9b..09b212d37f1d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -246,39 +246,7 @@ struct swap_info_struct {
void *workingset_eviction(struct address_space *mapping, struct page *page);
bool workingset_refault(void *shadow);
void workingset_activation(struct page *page);
-extern struct list_lru workingset_shadow_nodes;
-
-static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
-{
- return node->count & RADIX_TREE_COUNT_MASK;
-}
-
-static inline void workingset_node_pages_inc(struct radix_tree_node *node)
-{
- node->count++;
-}
-
-static inline void workingset_node_pages_dec(struct radix_tree_node *node)
-{
- VM_WARN_ON_ONCE(!workingset_node_pages(node));
- node->count--;
-}
-
-static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
-{
- return node->count >> RADIX_TREE_COUNT_SHIFT;
-}
-
-static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
-{
- node->count += 1U << RADIX_TREE_COUNT_SHIFT;
-}
-
-static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
-{
- VM_WARN_ON_ONCE(!workingset_node_shadows(node));
- node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
-}
+void workingset_update_node(struct radix_tree_node *node, void *private);
/* linux/mm/page_alloc.c */
extern unsigned long totalram_pages;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index df4ff18dd63c..9dbfaac05e6c 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -541,12 +541,10 @@ static int radix_tree_extend(struct radix_tree_root *root,
* radix_tree_shrink - shrink radix tree to minimum height
* @root radix tree root
*/
-static inline bool radix_tree_shrink(struct radix_tree_root *root,
+static inline void radix_tree_shrink(struct radix_tree_root *root,
radix_tree_update_node_t update_node,
void *private)
{
- bool shrunk = false;
-
for (;;) {
struct radix_tree_node *node = root->rnode;
struct radix_tree_node *child;
@@ -606,26 +604,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
}
radix_tree_node_free(node);
- shrunk = true;
}
-
- return shrunk;
}
-static bool delete_node(struct radix_tree_root *root,
+static void delete_node(struct radix_tree_root *root,
struct radix_tree_node *node,
radix_tree_update_node_t update_node, void *private)
{
- bool deleted = false;
-
do {
struct radix_tree_node *parent;
if (node->count) {
if (node == entry_to_node(root->rnode))
- deleted |= radix_tree_shrink(root, update_node,
- private);
- return deleted;
+ radix_tree_shrink(root, update_node, private);
+ return;
}
parent = node->parent;
@@ -638,12 +630,9 @@ static bool delete_node(struct radix_tree_root *root,
}
radix_tree_node_free(node);
- deleted = true;
node = parent;
} while (node);
-
- return deleted;
}
/**
@@ -1595,13 +1584,11 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
* After clearing the slot at @index in @node from radix tree
* rooted at @root, call this function to attempt freeing the
* node and shrinking the tree.
- *
- * Returns %true if @node was freed, %false otherwise.
*/
-bool __radix_tree_delete_node(struct radix_tree_root *root,
+void __radix_tree_delete_node(struct radix_tree_root *root,
struct radix_tree_node *node)
{
- return delete_node(root, node, NULL, NULL);
+ delete_node(root, node, NULL, NULL);
}
static inline void delete_sibling_entries(struct radix_tree_node *node,
diff --git a/mm/filemap.c b/mm/filemap.c
index eb463156f29a..7d92032277ff 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -132,37 +132,19 @@ static int page_cache_tree_insert(struct address_space *mapping,
if (!dax_mapping(mapping)) {
if (shadowp)
*shadowp = p;
- if (node)
- workingset_node_shadows_dec(node);
} else {
/* DAX can replace empty locked entry with a hole */
WARN_ON_ONCE(p !=
(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
RADIX_DAX_ENTRY_LOCK));
- /* DAX accounts exceptional entries as normal pages */
- if (node)
- workingset_node_pages_dec(node);
/* Wakeup waiters for exceptional entry lock */
dax_wake_mapping_entry_waiter(mapping, page->index,
false);
}
}
- radix_tree_replace_slot(&mapping->page_tree, slot, page);
+ __radix_tree_replace(&mapping->page_tree, node, slot, page,
+ workingset_update_node, mapping);
mapping->nrpages++;
- if (node) {
- workingset_node_pages_inc(node);
- /*
- * Don't track node that contains actual pages.
- *
- * Avoid acquiring the list_lru lock if already
- * untracked. The list_empty() test is safe as
- * node->private_list is protected by
- * mapping->tree_lock.
- */
- if (!list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes,
- &node->private_list);
- }
return 0;
}
@@ -182,8 +164,6 @@ static void page_cache_tree_delete(struct address_space *mapping,
__radix_tree_lookup(&mapping->page_tree, page->index + i,
&node, &slot);
- radix_tree_clear_tags(&mapping->page_tree, node, slot);
-
if (!node) {
VM_BUG_ON_PAGE(nr != 1, page);
/*
@@ -193,33 +173,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
shadow = NULL;
}
- radix_tree_replace_slot(&mapping->page_tree, slot, shadow);
-
- if (!node)
- break;
-
- workingset_node_pages_dec(node);
- if (shadow)
- workingset_node_shadows_inc(node);
- else
- if (__radix_tree_delete_node(&mapping->page_tree, node))
- continue;
-
- /*
- * Track node that only contains shadow entries. DAX mappings
- * contain no shadow entries and may contain other exceptional
- * entries so skip those.
- *
- * Avoid acquiring the list_lru lock if already tracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
- list_empty(&node->private_list)) {
- node->private_data = mapping;
- list_lru_add(&workingset_shadow_nodes,
- &node->private_list);
- }
+ radix_tree_clear_tags(&mapping->page_tree, node, slot);
+ __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
+ workingset_update_node, mapping);
}
if (shadow) {
diff --git a/mm/truncate.c b/mm/truncate.c
index 6ae44571d4c7..7e5464a611db 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping,
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
- if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
- &slot))
+ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
goto unlock;
if (*slot != entry)
goto unlock;
- radix_tree_replace_slot(&mapping->page_tree, slot, NULL);
+ __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
+ workingset_update_node, mapping);
mapping->nrexceptional--;
- if (!node)
- goto unlock;
- workingset_node_shadows_dec(node);
- /*
- * Don't track node without shadow entries.
- *
- * Avoid acquiring the list_lru lock if already untracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!workingset_node_shadows(node) &&
- !list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes,
- &node->private_list);
- __radix_tree_delete_node(&mapping->page_tree, node);
unlock:
spin_unlock_irq(&mapping->tree_lock);
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 3cfc61d84a52..111e06559892 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -10,6 +10,7 @@
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -334,18 +335,45 @@ void workingset_activation(struct page *page)
* point where they would still be useful.
*/
-struct list_lru workingset_shadow_nodes;
+static struct list_lru shadow_nodes;
+
+void workingset_update_node(struct radix_tree_node *node, void *private)
+{
+ struct address_space *mapping = private;
+
+ /* Only regular page cache has shadow entries */
+ if (dax_mapping(mapping) || shmem_mapping(mapping))
+ return;
+
+ /*
+ * Track non-empty nodes that contain only shadow entries;
+ * unlink those that contain pages or are being freed.
+ *
+ * Avoid acquiring the list_lru lock when the nodes are
+ * already where they should be. The list_empty() test is safe
+ * as node->private_list is protected by &mapping->tree_lock.
+ */
+ if (node->count && node->count == node->exceptional) {
+ if (list_empty(&node->private_list)) {
+ node->private_data = mapping;
+ list_lru_add(&shadow_nodes, &node->private_list);
+ }
+ } else {
+ if (!list_empty(&node->private_list))
+ list_lru_del(&shadow_nodes, &node->private_list);
+ }
+}
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long shadow_nodes;
unsigned long max_nodes;
+ unsigned long nodes;
unsigned long pages;
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
+ nodes = list_lru_shrink_count(&shadow_nodes, sc);
local_irq_enable();
if (memcg_kmem_enabled()) {
@@ -372,10 +400,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
*/
max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
- if (shadow_nodes <= max_nodes)
+ if (nodes <= max_nodes)
return 0;
- return shadow_nodes - max_nodes;
+ return nodes - max_nodes;
}
static enum lru_status shadow_lru_isolate(struct list_head *item,
@@ -418,22 +446,25 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
* no pages, so we expect to be able to remove them all and
* delete and free the empty node afterwards.
*/
- if (WARN_ON_ONCE(!workingset_node_shadows(node)))
+ if (WARN_ON_ONCE(!node->exceptional))
goto out_invalid;
- if (WARN_ON_ONCE(workingset_node_pages(node)))
+ if (WARN_ON_ONCE(node->count != node->exceptional))
goto out_invalid;
for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
if (node->slots[i]) {
if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
goto out_invalid;
+ if (WARN_ON_ONCE(!node->exceptional))
+ goto out_invalid;
if (WARN_ON_ONCE(!mapping->nrexceptional))
goto out_invalid;
node->slots[i] = NULL;
- workingset_node_shadows_dec(node);
+ node->exceptional--;
+ node->count--;
mapping->nrexceptional--;
}
}
- if (WARN_ON_ONCE(workingset_node_shadows(node)))
+ if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
__radix_tree_delete_node(&mapping->page_tree, node);
@@ -456,8 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
- shadow_lru_isolate, NULL);
+ ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
local_irq_enable();
return ret;
}
@@ -496,7 +526,7 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+ ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key);
if (ret)
goto err;
ret = register_shrinker(&workingset_shadow_shrinker);
@@ -504,7 +534,7 @@ static int __init workingset_init(void)
goto err_list_lru;
return 0;
err_list_lru:
- list_lru_destroy(&workingset_shadow_nodes);
+ list_lru_destroy(&shadow_nodes);
err:
return ret;
}
--
2.10.2
Shadow entries in the page cache used to be accounted behind the radix
tree implementation's back in the upper bits of node->count, and the
radix tree code extending a single-entry tree with a shadow entry in
root->rnode would corrupt that counter. As a result, we could not put
shadow entries at index 0 if the tree didn't have any other entries,
and that means no refault detection for any single-page file.
Now that the shadow entries are tracked natively in the radix tree's
exceptional counter, this is no longer necessary. Extending and
shrinking the tree from and to single entries in root->rnode now does
the right thing when the entry is exceptional, so remove that
limitation.
Signed-off-by: Johannes Weiner <[email protected]>
---
mm/filemap.c | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 7d92032277ff..ae7b6992aded 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -164,14 +164,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
__radix_tree_lookup(&mapping->page_tree, page->index + i,
&node, &slot);
- if (!node) {
- VM_BUG_ON_PAGE(nr != 1, page);
- /*
- * We need a node to properly account shadow
- * entries. Don't plant any without. XXX
- */
- shadow = NULL;
- }
+ VM_BUG_ON_PAGE(!node && nr != 1, page);
radix_tree_clear_tags(&mapping->page_tree, node, slot);
__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
--
2.10.2
On Thu, Nov 17, 2016 at 02:11:30PM -0500, Johannes Weiner wrote:
> When a radix tree iteration drops the tree lock, another thread might
> swoop in and free the node holding the current slot. The iteration
> needs to do another tree lookup from the current index to continue.
>
> [[email protected]: re-lookup for replacement]
> Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Thu, Nov 17, 2016 at 02:11:31PM -0500, Johannes Weiner wrote:
> The radix tree counts valid entries in each tree node. Entries stored
> in the tree cannot be removed by simply storing NULL in the slot, or
> the internal counters will be off and the node never gets freed again.
>
> When collapsing a shmem page fails, restore the holes that were filled
> with radix_tree_insert() with a proper radix tree deletion.
>
> Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
> Reported-by: Jan Kara <[email protected]>
> Signed-off-by: Johannes Weiner <[email protected]>
Acked-by: Kirill A. Shutemov <[email protected]>
--
Kirill A. Shutemov
On Thu 17-11-16 14:11:30, Johannes Weiner wrote:
> When a radix tree iteration drops the tree lock, another thread might
> swoop in and free the node holding the current slot. The iteration
> needs to do another tree lookup from the current index to continue.
>
> [[email protected]: re-lookup for replacement]
> Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
> Signed-off-by: Johannes Weiner <[email protected]>
The patch looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> mm/khugepaged.c | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 728d7790dc2d..bdfdab40a813 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1401,6 +1401,9 @@ static void collapse_shmem(struct mm_struct *mm,
>
> spin_lock_irq(&mapping->tree_lock);
>
> + slot = radix_tree_lookup_slot(&mapping->page_tree, index);
> + VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
> + &mapping->tree_lock), page);
> VM_BUG_ON_PAGE(page_mapped(page), page);
>
> /*
> @@ -1424,6 +1427,7 @@ static void collapse_shmem(struct mm_struct *mm,
> radix_tree_replace_slot(slot,
> new_page + (index % HPAGE_PMD_NR));
>
> + slot = radix_tree_iter_next(&iter);
> index++;
> continue;
> out_lru:
> @@ -1535,6 +1539,7 @@ static void collapse_shmem(struct mm_struct *mm,
> putback_lru_page(page);
> unlock_page(page);
> spin_lock_irq(&mapping->tree_lock);
> + slot = radix_tree_iter_next(&iter);
> }
> VM_BUG_ON(nr_none);
> spin_unlock_irq(&mapping->tree_lock);
> --
> 2.10.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 17-11-16 14:11:31, Johannes Weiner wrote:
> The radix tree counts valid entries in each tree node. Entries stored
> in the tree cannot be removed by simply storing NULL in the slot, or
> the internal counters will be off and the node never gets freed again.
>
> When collapsing a shmem page fails, restore the holes that were filled
> with radix_tree_insert() with a proper radix tree deletion.
>
> Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
> Reported-by: Jan Kara <[email protected]>
> Signed-off-by: Johannes Weiner <[email protected]>
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> mm/khugepaged.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index bdfdab40a813..d553c294de40 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1523,9 +1523,11 @@ static void collapse_shmem(struct mm_struct *mm,
> if (!page || iter.index < page->index) {
> if (!nr_none)
> break;
> - /* Put holes back where they were */
> - radix_tree_replace_slot(slot, NULL);
> nr_none--;
> + /* Put holes back where they were */
> + radix_tree_delete(&mapping->page_tree,
> + iter.index);
> + slot = radix_tree_iter_next(&iter);
> continue;
> }
>
> --
> 2.10.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 17-11-16 14:11:32, Johannes Weiner wrote:
> When the shadow page shrinker tries to reclaim a radix tree node but
> finds it in an unexpected state - it should contain no pages, and
> non-zero shadow entries - there is no need to kill the executing task
> or even the entire system. Warn about the invalid state, then leave
> that tree node be. Simply don't put it back on the shadow LRU for
> future reclaim and move on.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> mm/workingset.c | 20 ++++++++++++--------
> 1 file changed, 12 insertions(+), 8 deletions(-)
>
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 617475f529f4..3cfc61d84a52 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -418,23 +418,27 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
> * no pages, so we expect to be able to remove them all and
> * delete and free the empty node afterwards.
> */
> - BUG_ON(!workingset_node_shadows(node));
> - BUG_ON(workingset_node_pages(node));
> -
> + if (WARN_ON_ONCE(!workingset_node_shadows(node)))
> + goto out_invalid;
> + if (WARN_ON_ONCE(workingset_node_pages(node)))
> + goto out_invalid;
> for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
> if (node->slots[i]) {
> - BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
> + if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
> + goto out_invalid;
> + if (WARN_ON_ONCE(!mapping->nrexceptional))
> + goto out_invalid;
> node->slots[i] = NULL;
> workingset_node_shadows_dec(node);
> - BUG_ON(!mapping->nrexceptional);
> mapping->nrexceptional--;
> }
> }
> - BUG_ON(workingset_node_shadows(node));
> + if (WARN_ON_ONCE(workingset_node_shadows(node)))
> + goto out_invalid;
> inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
> - if (!__radix_tree_delete_node(&mapping->page_tree, node))
> - BUG();
> + __radix_tree_delete_node(&mapping->page_tree, node);
>
> +out_invalid:
> spin_unlock(&mapping->tree_lock);
> ret = LRU_REMOVED_RETRY;
> out:
> --
> 2.10.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 17-11-16 14:29:45, Johannes Weiner wrote:
> The way the page cache is sneaking shadow entries of evicted pages
> into the radix tree past the node entry accounting and tracking them
> manually in the upper bits of node->count is fraught with problems.
>
> These shadow entries are marked in the tree as exceptional entries,
> which are a native concept to the radix tree. Maintain an explicit
> counter of exceptional entries in the radix tree node. Subsequent
> patches will switch shadow entry tracking over to that counter.
>
> DAX and shmem are the other users of exceptional entries. Since slot
> replacements that change the entry type from regular to exceptional
> must now be accounted, introduce a __radix_tree_replace() function
> that does replacement and accounting, and switch DAX and shmem over.
>
> The increase in radix tree node size is temporary. A followup patch
> switches the shadow tracking to this new scheme and we'll no longer
> need the upper bits in node->count and shrink that back to one byte.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Looks good to me. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> fs/dax.c | 5 +++--
> include/linux/radix-tree.h | 10 +++++++---
> lib/radix-tree.c | 46 +++++++++++++++++++++++++++++++++++++++++++---
> mm/shmem.c | 8 ++++----
> 4 files changed, 57 insertions(+), 12 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 014defd2e744..db78bae0dc0f 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -643,12 +643,13 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
> }
> mapping->nrexceptional++;
> } else {
> + struct radix_tree_node *node;
> void **slot;
> void *ret;
>
> - ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
> + ret = __radix_tree_lookup(page_tree, index, &node, &slot);
> WARN_ON_ONCE(ret != entry);
> - radix_tree_replace_slot(slot, new_entry);
> + __radix_tree_replace(page_tree, node, slot, new_entry);
> }
> if (vmf->flags & FAULT_FLAG_WRITE)
> radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index af3581b8a451..7ced8a70cc8b 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -85,9 +85,10 @@ static inline bool radix_tree_is_internal_node(void *ptr)
> #define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
>
> struct radix_tree_node {
> - unsigned char shift; /* Bits remaining in each slot */
> - unsigned char offset; /* Slot offset in parent */
> - unsigned int count;
> + unsigned char shift; /* Bits remaining in each slot */
> + unsigned char offset; /* Slot offset in parent */
> + unsigned int count; /* Total entry count */
> + unsigned char exceptional; /* Exceptional entry count */
> union {
> struct {
> /* Used when ascending tree */
> @@ -276,6 +277,9 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
> struct radix_tree_node **nodep, void ***slotp);
> void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
> void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
> +void __radix_tree_replace(struct radix_tree_root *root,
> + struct radix_tree_node *node,
> + void **slot, void *item);
> bool __radix_tree_delete_node(struct radix_tree_root *root,
> struct radix_tree_node *node);
> void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index 8e6d552c40dd..7885796d35ae 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
> {
> unsigned long i;
>
> - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
> + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n",
> node, node->offset,
> node->tags[0][0], node->tags[1][0], node->tags[2][0],
> - node->shift, node->count, node->parent);
> + node->shift, node->count, node->exceptional, node->parent);
>
> for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
> unsigned long first = index | (i << node->shift);
> @@ -522,8 +522,13 @@ static int radix_tree_extend(struct radix_tree_root *root,
> node->offset = 0;
> node->count = 1;
> node->parent = NULL;
> - if (radix_tree_is_internal_node(slot))
> + if (radix_tree_is_internal_node(slot)) {
> entry_to_node(slot)->parent = node;
> + } else {
> + /* Moving an exceptional root->rnode to a node */
> + if (radix_tree_exceptional_entry(slot))
> + node->exceptional = 1;
> + }
> node->slots[0] = slot;
> slot = node_to_entry(node);
> rcu_assign_pointer(root->rnode, slot);
> @@ -649,6 +654,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
> if (node) {
> unsigned offset = get_slot_offset(node, slot);
> node->count++;
> + if (radix_tree_exceptional_entry(item))
> + node->exceptional++;
> BUG_ON(tag_get(node, 0, offset));
> BUG_ON(tag_get(node, 1, offset));
> BUG_ON(tag_get(node, 2, offset));
> @@ -747,6 +754,37 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
> EXPORT_SYMBOL(radix_tree_lookup);
>
> /**
> + * __radix_tree_replace - replace item in a slot
> + * @root: radix tree root
> + * @node: pointer to tree node
> + * @slot: pointer to slot in @node
> + * @item: new item to store in the slot.
> + *
> + * For use with __radix_tree_lookup(). Caller must hold tree write locked
> + * across slot lookup and replacement.
> + */
> +void __radix_tree_replace(struct radix_tree_root *root,
> + struct radix_tree_node *node,
> + void **slot, void *item)
> +{
> + void *old = rcu_dereference_raw(*slot);
> + int exceptional;
> +
> + WARN_ON_ONCE(radix_tree_is_internal_node(item));
> + WARN_ON_ONCE(!!item - !!old);
> +
> + exceptional = !!radix_tree_exceptional_entry(item) -
> + !!radix_tree_exceptional_entry(old);
> +
> + WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode);
> +
> + if (node)
> + node->exceptional += exceptional;
> +
> + rcu_assign_pointer(*slot, item);
> +}
> +
> +/**
> * radix_tree_tag_set - set a tag on a radix tree node
> * @root: radix tree root
> * @index: index key
> @@ -1561,6 +1599,8 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
> delete_sibling_entries(node, node_to_entry(slot), offset);
> node->slots[offset] = NULL;
> node->count--;
> + if (radix_tree_exceptional_entry(entry))
> + node->exceptional--;
>
> __radix_tree_delete_node(root, node);
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index ad7813d73ea7..7f3a08df25c9 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -300,18 +300,18 @@ void shmem_uncharge(struct inode *inode, long pages)
> static int shmem_radix_tree_replace(struct address_space *mapping,
> pgoff_t index, void *expected, void *replacement)
> {
> + struct radix_tree_node *node;
> void **pslot;
> void *item;
>
> VM_BUG_ON(!expected);
> VM_BUG_ON(!replacement);
> - pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
> - if (!pslot)
> + item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
> + if (!item)
> return -ENOENT;
> - item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
> if (item != expected)
> return -ENOENT;
> - radix_tree_replace_slot(pslot, replacement);
> + __radix_tree_replace(&mapping->page_tree, node, pslot, replacement);
> return 0;
> }
>
> --
> 2.10.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
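
As a usage note, the calling convention the patch above switches shmem and DAX
to looks roughly like this (mapping, index, expected and new_entry are
placeholders, mirroring shmem_radix_tree_replace()):

        struct radix_tree_node *node;
        void **slot;
        void *item;

        item = __radix_tree_lookup(&mapping->page_tree, index, &node, &slot);
        if (item == expected)
                /* Accounted replacement: node->exceptional is adjusted when
                 * the entry type changes between regular and exceptional. */
                __radix_tree_replace(&mapping->page_tree, node, slot, new_entry);
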
On Thu 17-11-16 14:30:21, Johannes Weiner wrote:
> The bug in khugepaged fixed earlier in this series shows that radix
> tree slot replacement is fragile; and it will become more so when not
> only NULL<->!NULL transitions need to be caught but transitions from
> and to exceptional entries as well. We need checks.
>
> Re-implement radix_tree_replace_slot() on top of the sanity-checked
> __radix_tree_replace(). This requires existing callers to also pass
> the radix tree root, but it'll warn us when somebody replaces slots
> with contents that need proper accounting (transitions between NULL
> entries, real entries, exceptional entries) and where a replacement
> through the slot pointer would corrupt the radix tree node counts.
>
> Suggested-by: Jan Kara <[email protected]>
> Signed-off-by: Johannes Weiner <[email protected]>
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
One nit below:
> @@ -785,6 +776,50 @@ void __radix_tree_replace(struct radix_tree_root *root,
> }
>
> /**
> + * __radix_tree_replace - replace item in a slot
> + * @root: radix tree root
> + * @node: pointer to tree node
> + * @slot: pointer to slot in @node
> + * @item: new item to store in the slot.
> + *
> + * For use with __radix_tree_lookup(). Caller must hold tree write locked
> + * across slot lookup and replacement.
> + */
I'd comment here that even this function cannot be used for NULL <->
non-NULL replacements. For that, there are radix_tree_delete() and
radix_tree_insert().
Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR
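
A short usage sketch of the distinction the patch above draws (new_page,
shadow, node and slot are placeholders); radix_tree_replace_slot() now warns
when the replacement would change the node's entry accounting behind the
tree's back:

        /* Same-type replacement (page for page, shadow for shadow): */
        radix_tree_replace_slot(&mapping->page_tree, slot, new_page);

        /* Transitions that change accounting (NULL <-> entry, regular <->
         * exceptional) must instead go through the node-aware helper: */
        __radix_tree_replace(&mapping->page_tree, node, slot, shadow);
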
On Thu 17-11-16 14:30:58, Johannes Weiner wrote:
> Page cache shadow entry handling will be a lot simpler when it can use
> a single generic replacement function for pages, shadow entries, and
> emptying slots.
>
> Make __radix_tree_replace() properly account insertions and deletions
> in node->count and garbage collect nodes as they become empty. Then
> re-implement radix_tree_delete() on top of it.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Seeing this patch, just ignore my nit on the previous one. Also, it would
have been easier to review this patch if you had split out the move of
those two functions into a separate patch and noted that it's just a code
move...
Anyway, the result looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> lib/radix-tree.c | 227 ++++++++++++++++++++++++++++---------------------------
> 1 file changed, 116 insertions(+), 111 deletions(-)
>
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index f91d5b0af654..5d8930f3b3d8 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -539,6 +539,107 @@ static int radix_tree_extend(struct radix_tree_root *root,
> }
>
> /**
> + * radix_tree_shrink - shrink radix tree to minimum height
> + * @root radix tree root
> + */
> +static inline bool radix_tree_shrink(struct radix_tree_root *root)
> +{
> + bool shrunk = false;
> +
> + for (;;) {
> + struct radix_tree_node *node = root->rnode;
> + struct radix_tree_node *child;
> +
> + if (!radix_tree_is_internal_node(node))
> + break;
> + node = entry_to_node(node);
> +
> + /*
> + * The candidate node has more than one child, or its child
> + * is not at the leftmost slot, or the child is a multiorder
> + * entry, we cannot shrink.
> + */
> + if (node->count != 1)
> + break;
> + child = node->slots[0];
> + if (!child)
> + break;
> + if (!radix_tree_is_internal_node(child) && node->shift)
> + break;
> +
> + if (radix_tree_is_internal_node(child))
> + entry_to_node(child)->parent = NULL;
> +
> + /*
> + * We don't need rcu_assign_pointer(), since we are simply
> + * moving the node from one part of the tree to another: if it
> + * was safe to dereference the old pointer to it
> + * (node->slots[0]), it will be safe to dereference the new
> + * one (root->rnode) as far as dependent read barriers go.
> + */
> + root->rnode = child;
> +
> + /*
> + * We have a dilemma here. The node's slot[0] must not be
> + * NULLed in case there are concurrent lookups expecting to
> + * find the item. However if this was a bottom-level node,
> + * then it may be subject to the slot pointer being visible
> + * to callers dereferencing it. If item corresponding to
> + * slot[0] is subsequently deleted, these callers would expect
> + * their slot to become empty sooner or later.
> + *
> + * For example, lockless pagecache will look up a slot, deref
> + * the page pointer, and if the page has 0 refcount it means it
> + * was concurrently deleted from pagecache so try the deref
> + * again. Fortunately there is already a requirement for logic
> + * to retry the entire slot lookup -- the indirect pointer
> + * problem (replacing direct root node with an indirect pointer
> + * also results in a stale slot). So tag the slot as indirect
> + * to force callers to retry.
> + */
> + if (!radix_tree_is_internal_node(child))
> + node->slots[0] = RADIX_TREE_RETRY;
> +
> + radix_tree_node_free(node);
> + shrunk = true;
> + }
> +
> + return shrunk;
> +}
> +
> +static bool delete_node(struct radix_tree_root *root,
> + struct radix_tree_node *node)
> +{
> + bool deleted = false;
> +
> + do {
> + struct radix_tree_node *parent;
> +
> + if (node->count) {
> + if (node == entry_to_node(root->rnode))
> + deleted |= radix_tree_shrink(root);
> + return deleted;
> + }
> +
> + parent = node->parent;
> + if (parent) {
> + parent->slots[node->offset] = NULL;
> + parent->count--;
> + } else {
> + root_tag_clear_all(root);
> + root->rnode = NULL;
> + }
> +
> + radix_tree_node_free(node);
> + deleted = true;
> +
> + node = parent;
> + } while (node);
> +
> + return deleted;
> +}
> +
> +/**
> * __radix_tree_create - create a slot in a radix tree
> * @root: radix tree root
> * @index: index key
> @@ -759,18 +860,20 @@ static void replace_slot(struct radix_tree_root *root,
> bool warn_typeswitch)
> {
> void *old = rcu_dereference_raw(*slot);
> - int exceptional;
> + int count, exceptional;
>
> WARN_ON_ONCE(radix_tree_is_internal_node(item));
> - WARN_ON_ONCE(!!item - !!old);
>
> + count = !!item - !!old;
> exceptional = !!radix_tree_exceptional_entry(item) -
> !!radix_tree_exceptional_entry(old);
>
> - WARN_ON_ONCE(warn_typeswitch && exceptional);
> + WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
>
> - if (node)
> + if (node) {
> + node->count += count;
> node->exceptional += exceptional;
> + }
>
> rcu_assign_pointer(*slot, item);
> }
> @@ -790,12 +893,14 @@ void __radix_tree_replace(struct radix_tree_root *root,
> void **slot, void *item)
> {
> /*
> - * This function supports replacing exceptional entries, but
> - * that needs accounting against the node unless the slot is
> - * root->rnode.
> + * This function supports replacing exceptional entries and
> + * deleting entries, but that needs accounting against the
> + * node unless the slot is root->rnode.
> */
> replace_slot(root, node, slot, item,
> !node && slot != (void **)&root->rnode);
> +
> + delete_node(root, node);
> }
>
> /**
> @@ -810,8 +915,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
> *
> * NOTE: This cannot be used to switch between non-entries (empty slots),
> * regular entries, and exceptional entries, as that requires accounting
> - * inside the radix tree node. When switching from one type of entry to
> - * another, use __radix_tree_lookup() and __radix_tree_replace().
> + * inside the radix tree node. When switching from one type of entry or
> + * deleting, use __radix_tree_lookup() and __radix_tree_replace().
> */
> void radix_tree_replace_slot(struct radix_tree_root *root,
> void **slot, void *item)
> @@ -1467,75 +1572,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
> #endif /* CONFIG_SHMEM && CONFIG_SWAP */
>
> /**
> - * radix_tree_shrink - shrink radix tree to minimum height
> - * @root radix tree root
> - */
> -static inline bool radix_tree_shrink(struct radix_tree_root *root)
> -{
> - bool shrunk = false;
> -
> - for (;;) {
> - struct radix_tree_node *node = root->rnode;
> - struct radix_tree_node *child;
> -
> - if (!radix_tree_is_internal_node(node))
> - break;
> - node = entry_to_node(node);
> -
> - /*
> - * The candidate node has more than one child, or its child
> - * is not at the leftmost slot, or the child is a multiorder
> - * entry, we cannot shrink.
> - */
> - if (node->count != 1)
> - break;
> - child = node->slots[0];
> - if (!child)
> - break;
> - if (!radix_tree_is_internal_node(child) && node->shift)
> - break;
> -
> - if (radix_tree_is_internal_node(child))
> - entry_to_node(child)->parent = NULL;
> -
> - /*
> - * We don't need rcu_assign_pointer(), since we are simply
> - * moving the node from one part of the tree to another: if it
> - * was safe to dereference the old pointer to it
> - * (node->slots[0]), it will be safe to dereference the new
> - * one (root->rnode) as far as dependent read barriers go.
> - */
> - root->rnode = child;
> -
> - /*
> - * We have a dilemma here. The node's slot[0] must not be
> - * NULLed in case there are concurrent lookups expecting to
> - * find the item. However if this was a bottom-level node,
> - * then it may be subject to the slot pointer being visible
> - * to callers dereferencing it. If item corresponding to
> - * slot[0] is subsequently deleted, these callers would expect
> - * their slot to become empty sooner or later.
> - *
> - * For example, lockless pagecache will look up a slot, deref
> - * the page pointer, and if the page has 0 refcount it means it
> - * was concurrently deleted from pagecache so try the deref
> - * again. Fortunately there is already a requirement for logic
> - * to retry the entire slot lookup -- the indirect pointer
> - * problem (replacing direct root node with an indirect pointer
> - * also results in a stale slot). So tag the slot as indirect
> - * to force callers to retry.
> - */
> - if (!radix_tree_is_internal_node(child))
> - node->slots[0] = RADIX_TREE_RETRY;
> -
> - radix_tree_node_free(node);
> - shrunk = true;
> - }
> -
> - return shrunk;
> -}
> -
> -/**
> * __radix_tree_delete_node - try to free node after clearing a slot
> * @root: radix tree root
> * @node: node containing @index
> @@ -1549,33 +1585,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
> bool __radix_tree_delete_node(struct radix_tree_root *root,
> struct radix_tree_node *node)
> {
> - bool deleted = false;
> -
> - do {
> - struct radix_tree_node *parent;
> -
> - if (node->count) {
> - if (node == entry_to_node(root->rnode))
> - deleted |= radix_tree_shrink(root);
> - return deleted;
> - }
> -
> - parent = node->parent;
> - if (parent) {
> - parent->slots[node->offset] = NULL;
> - parent->count--;
> - } else {
> - root_tag_clear_all(root);
> - root->rnode = NULL;
> - }
> -
> - radix_tree_node_free(node);
> - deleted = true;
> -
> - node = parent;
> - } while (node);
> -
> - return deleted;
> + return delete_node(root, node);
> }
>
> static inline void delete_sibling_entries(struct radix_tree_node *node,
> @@ -1632,12 +1642,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
> node_tag_clear(root, node, tag, offset);
>
> delete_sibling_entries(node, node_to_entry(slot), offset);
> - node->slots[offset] = NULL;
> - node->count--;
> - if (radix_tree_exceptional_entry(entry))
> - node->exceptional--;
> -
> - __radix_tree_delete_node(root, node);
> + __radix_tree_replace(root, node, slot, NULL);
>
> return entry;
> }
> --
> 2.10.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
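
In practice, the patch above means that clearing a slot through the accounted
helper is now a full deletion. A minimal sketch (mapping and index are
placeholders; tags and sibling entries, which radix_tree_delete_item() still
handles separately, are ignored):

        struct radix_tree_node *node;
        void **slot;
        void *entry;

        entry = __radix_tree_lookup(&mapping->page_tree, index, &node, &slot);
        if (entry)
                /* Decrements node->count (and node->exceptional if needed)
                 * and frees the node chain once it becomes empty. */
                __radix_tree_replace(&mapping->page_tree, node, slot, NULL);
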
On Thu 17-11-16 14:31:34, Johannes Weiner wrote:
> Support handing __radix_tree_replace() a callback that gets invoked
> for all leaf nodes that change or get freed as a result of the slot
> replacement, to assist users tracking nodes with node->private_list.
>
> This prepares for putting page cache shadow entries into the radix
> tree root again and drastically simplifying the shadow tracking.
>
> Suggested-by: Jan Kara <[email protected]>
> Signed-off-by: Johannes Weiner <[email protected]>
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> fs/dax.c | 3 ++-
> include/linux/radix-tree.h | 4 +++-
> lib/radix-tree.c | 42 +++++++++++++++++++++++++++++-------------
> mm/shmem.c | 3 ++-
> 4 files changed, 36 insertions(+), 16 deletions(-)
>
> diff --git a/fs/dax.c b/fs/dax.c
> index 85930c2a2749..6916ed37d463 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -649,7 +649,8 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
>
> ret = __radix_tree_lookup(page_tree, index, &node, &slot);
> WARN_ON_ONCE(ret != entry);
> - __radix_tree_replace(page_tree, node, slot, new_entry);
> + __radix_tree_replace(page_tree, node, slot,
> + new_entry, NULL, NULL);
> }
> if (vmf->flags & FAULT_FLAG_WRITE)
> radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index 2d1b9b8be983..15c972ea9510 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -263,9 +263,11 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
> struct radix_tree_node **nodep, void ***slotp);
> void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
> void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
> +typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
> void __radix_tree_replace(struct radix_tree_root *root,
> struct radix_tree_node *node,
> - void **slot, void *item);
> + void **slot, void *item,
> + radix_tree_update_node_t update_node, void *private);
> void radix_tree_replace_slot(struct radix_tree_root *root,
> void **slot, void *item);
> bool __radix_tree_delete_node(struct radix_tree_root *root,
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index 5d8930f3b3d8..df4ff18dd63c 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
> tag_clear(node, i, 0);
>
> node->slots[0] = NULL;
> - node->count = 0;
>
> kmem_cache_free(radix_tree_node_cachep, node);
> }
> @@ -542,7 +541,9 @@ static int radix_tree_extend(struct radix_tree_root *root,
> * radix_tree_shrink - shrink radix tree to minimum height
> * @root radix tree root
> */
> -static inline bool radix_tree_shrink(struct radix_tree_root *root)
> +static inline bool radix_tree_shrink(struct radix_tree_root *root,
> + radix_tree_update_node_t update_node,
> + void *private)
> {
> bool shrunk = false;
>
> @@ -597,8 +598,12 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
> * also results in a stale slot). So tag the slot as indirect
> * to force callers to retry.
> */
> - if (!radix_tree_is_internal_node(child))
> + node->count = 0;
> + if (!radix_tree_is_internal_node(child)) {
> node->slots[0] = RADIX_TREE_RETRY;
> + if (update_node)
> + update_node(node, private);
> + }
>
> radix_tree_node_free(node);
> shrunk = true;
> @@ -608,7 +613,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
> }
>
> static bool delete_node(struct radix_tree_root *root,
> - struct radix_tree_node *node)
> + struct radix_tree_node *node,
> + radix_tree_update_node_t update_node, void *private)
> {
> bool deleted = false;
>
> @@ -617,7 +623,8 @@ static bool delete_node(struct radix_tree_root *root,
>
> if (node->count) {
> if (node == entry_to_node(root->rnode))
> - deleted |= radix_tree_shrink(root);
> + deleted |= radix_tree_shrink(root, update_node,
> + private);
> return deleted;
> }
>
> @@ -880,17 +887,20 @@ static void replace_slot(struct radix_tree_root *root,
>
> /**
> * __radix_tree_replace - replace item in a slot
> - * @root: radix tree root
> - * @node: pointer to tree node
> - * @slot: pointer to slot in @node
> - * @item: new item to store in the slot.
> + * @root: radix tree root
> + * @node: pointer to tree node
> + * @slot: pointer to slot in @node
> + * @item: new item to store in the slot.
> + * @update_node: callback for changing leaf nodes
> + * @private: private data to pass to @update_node
> *
> * For use with __radix_tree_lookup(). Caller must hold tree write locked
> * across slot lookup and replacement.
> */
> void __radix_tree_replace(struct radix_tree_root *root,
> struct radix_tree_node *node,
> - void **slot, void *item)
> + void **slot, void *item,
> + radix_tree_update_node_t update_node, void *private)
> {
> /*
> * This function supports replacing exceptional entries and
> @@ -900,7 +910,13 @@ void __radix_tree_replace(struct radix_tree_root *root,
> replace_slot(root, node, slot, item,
> !node && slot != (void **)&root->rnode);
>
> - delete_node(root, node);
> + if (!node)
> + return;
> +
> + if (update_node)
> + update_node(node, private);
> +
> + delete_node(root, node, update_node, private);
> }
>
> /**
> @@ -1585,7 +1601,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
> bool __radix_tree_delete_node(struct radix_tree_root *root,
> struct radix_tree_node *node)
> {
> - return delete_node(root, node);
> + return delete_node(root, node, NULL, NULL);
> }
>
> static inline void delete_sibling_entries(struct radix_tree_node *node,
> @@ -1642,7 +1658,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
> node_tag_clear(root, node, tag, offset);
>
> delete_sibling_entries(node, node_to_entry(slot), offset);
> - __radix_tree_replace(root, node, slot, NULL);
> + __radix_tree_replace(root, node, slot, NULL, NULL, NULL);
>
> return entry;
> }
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 7f3a08df25c9..62ac381069fc 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -311,7 +311,8 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
> return -ENOENT;
> if (item != expected)
> return -ENOENT;
> - __radix_tree_replace(&mapping->page_tree, node, pslot, replacement);
> + __radix_tree_replace(&mapping->page_tree, node, pslot,
> + replacement, NULL, NULL);
> return 0;
> }
>
> --
> 2.10.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
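
To illustrate the shape of the new hook (the real user,
workingset_update_node(), only arrives in the next patch), a hypothetical
callback and call site might look like this; all names besides the radix tree
API are made up for the example:

        static void example_update_node(struct radix_tree_node *node, void *private)
        {
                struct address_space *mapping = private;

                /* Called for every leaf node whose contents change or that
                 * is about to be freed; node fields can be inspected here
                 * while the caller still holds mapping->tree_lock. */
                pr_debug("node %p for mapping %p: count %u\n",
                         node, mapping, node->count);
        }

        __radix_tree_replace(&mapping->page_tree, node, slot, new_entry,
                             example_update_node, mapping);
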
On Thu 17-11-16 14:32:11, Johannes Weiner wrote:
> Currently, we track the shadow entries in the page cache in the upper
> bits of the radix_tree_node->count, behind the back of the radix tree
> implementation. Because the radix tree code has no awareness of them,
> we rely on random subtleties throughout the implementation (such as
> the node->count != 1 check in the shrinking code, which is meant to
> exclude multi-entry nodes but also happens to skip nodes with only one
> shadow entry, as that's accounted in the upper bits). This is error
> prone and has, in fact, caused the bug fixed in d3798ae8c6f3 ("mm:
> filemap: don't plant shadow entries without radix tree node").
>
> To remove these subtleties, this patch moves shadow entry tracking
> from the upper bits of node->count to the existing counter for
> exceptional entries. node->count goes back to being a simple counter
> of valid entries in the tree node and can be shrunk to a single byte.
>
> This vastly simplifies the page cache code. All accounting happens
> natively inside the radix tree implementation, and maintaining the LRU
> linkage of shadow nodes is consolidated into a single function in the
> workingset code that is called for leaf nodes affected by a change in
> the page cache tree.
>
> This also removes the last user of the __radix_tree_delete_node() return
> value. Eliminate it.
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
>
> Signed-off-by: Johannes Weiner <[email protected]>
> ---
> include/linux/radix-tree.h | 8 ++-----
> include/linux/swap.h | 34 +---------------------------
> lib/radix-tree.c | 25 +++++----------------
> mm/filemap.c | 54 +++++---------------------------------------
> mm/truncate.c | 21 +++--------------
> mm/workingset.c | 56 +++++++++++++++++++++++++++++++++++-----------
> 6 files changed, 60 insertions(+), 138 deletions(-)
>
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index 15c972ea9510..744486057e9e 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -80,14 +80,10 @@ static inline bool radix_tree_is_internal_node(void *ptr)
> #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
> RADIX_TREE_MAP_SHIFT))
>
> -/* Internally used bits of node->count */
> -#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
> -#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
> -
> struct radix_tree_node {
> unsigned char shift; /* Bits remaining in each slot */
> unsigned char offset; /* Slot offset in parent */
> - unsigned int count; /* Total entry count */
> + unsigned char count; /* Total entry count */
> unsigned char exceptional; /* Exceptional entry count */
> union {
> struct {
> @@ -270,7 +266,7 @@ void __radix_tree_replace(struct radix_tree_root *root,
> radix_tree_update_node_t update_node, void *private);
> void radix_tree_replace_slot(struct radix_tree_root *root,
> void **slot, void *item);
> -bool __radix_tree_delete_node(struct radix_tree_root *root,
> +void __radix_tree_delete_node(struct radix_tree_root *root,
> struct radix_tree_node *node);
> void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
> void *radix_tree_delete(struct radix_tree_root *, unsigned long);
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index a56523cefb9b..09b212d37f1d 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -246,39 +246,7 @@ struct swap_info_struct {
> void *workingset_eviction(struct address_space *mapping, struct page *page);
> bool workingset_refault(void *shadow);
> void workingset_activation(struct page *page);
> -extern struct list_lru workingset_shadow_nodes;
> -
> -static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
> -{
> - return node->count & RADIX_TREE_COUNT_MASK;
> -}
> -
> -static inline void workingset_node_pages_inc(struct radix_tree_node *node)
> -{
> - node->count++;
> -}
> -
> -static inline void workingset_node_pages_dec(struct radix_tree_node *node)
> -{
> - VM_WARN_ON_ONCE(!workingset_node_pages(node));
> - node->count--;
> -}
> -
> -static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
> -{
> - return node->count >> RADIX_TREE_COUNT_SHIFT;
> -}
> -
> -static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
> -{
> - node->count += 1U << RADIX_TREE_COUNT_SHIFT;
> -}
> -
> -static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
> -{
> - VM_WARN_ON_ONCE(!workingset_node_shadows(node));
> - node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
> -}
> +void workingset_update_node(struct radix_tree_node *node, void *private);
>
> /* linux/mm/page_alloc.c */
> extern unsigned long totalram_pages;
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index df4ff18dd63c..9dbfaac05e6c 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -541,12 +541,10 @@ static int radix_tree_extend(struct radix_tree_root *root,
> * radix_tree_shrink - shrink radix tree to minimum height
> * @root radix tree root
> */
> -static inline bool radix_tree_shrink(struct radix_tree_root *root,
> +static inline void radix_tree_shrink(struct radix_tree_root *root,
> radix_tree_update_node_t update_node,
> void *private)
> {
> - bool shrunk = false;
> -
> for (;;) {
> struct radix_tree_node *node = root->rnode;
> struct radix_tree_node *child;
> @@ -606,26 +604,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root,
> }
>
> radix_tree_node_free(node);
> - shrunk = true;
> }
> -
> - return shrunk;
> }
>
> -static bool delete_node(struct radix_tree_root *root,
> +static void delete_node(struct radix_tree_root *root,
> struct radix_tree_node *node,
> radix_tree_update_node_t update_node, void *private)
> {
> - bool deleted = false;
> -
> do {
> struct radix_tree_node *parent;
>
> if (node->count) {
> if (node == entry_to_node(root->rnode))
> - deleted |= radix_tree_shrink(root, update_node,
> - private);
> - return deleted;
> + radix_tree_shrink(root, update_node, private);
> + return;
> }
>
> parent = node->parent;
> @@ -638,12 +630,9 @@ static bool delete_node(struct radix_tree_root *root,
> }
>
> radix_tree_node_free(node);
> - deleted = true;
>
> node = parent;
> } while (node);
> -
> - return deleted;
> }
>
> /**
> @@ -1595,13 +1584,11 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
> * After clearing the slot at @index in @node from radix tree
> * rooted at @root, call this function to attempt freeing the
> * node and shrinking the tree.
> - *
> - * Returns %true if @node was freed, %false otherwise.
> */
> -bool __radix_tree_delete_node(struct radix_tree_root *root,
> +void __radix_tree_delete_node(struct radix_tree_root *root,
> struct radix_tree_node *node)
> {
> - return delete_node(root, node, NULL, NULL);
> + delete_node(root, node, NULL, NULL);
> }
>
> static inline void delete_sibling_entries(struct radix_tree_node *node,
> diff --git a/mm/filemap.c b/mm/filemap.c
> index eb463156f29a..7d92032277ff 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -132,37 +132,19 @@ static int page_cache_tree_insert(struct address_space *mapping,
> if (!dax_mapping(mapping)) {
> if (shadowp)
> *shadowp = p;
> - if (node)
> - workingset_node_shadows_dec(node);
> } else {
> /* DAX can replace empty locked entry with a hole */
> WARN_ON_ONCE(p !=
> (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
> RADIX_DAX_ENTRY_LOCK));
> - /* DAX accounts exceptional entries as normal pages */
> - if (node)
> - workingset_node_pages_dec(node);
> /* Wakeup waiters for exceptional entry lock */
> dax_wake_mapping_entry_waiter(mapping, page->index,
> false);
> }
> }
> - radix_tree_replace_slot(&mapping->page_tree, slot, page);
> + __radix_tree_replace(&mapping->page_tree, node, slot, page,
> + workingset_update_node, mapping);
> mapping->nrpages++;
> - if (node) {
> - workingset_node_pages_inc(node);
> - /*
> - * Don't track node that contains actual pages.
> - *
> - * Avoid acquiring the list_lru lock if already
> - * untracked. The list_empty() test is safe as
> - * node->private_list is protected by
> - * mapping->tree_lock.
> - */
> - if (!list_empty(&node->private_list))
> - list_lru_del(&workingset_shadow_nodes,
> - &node->private_list);
> - }
> return 0;
> }
>
> @@ -182,8 +164,6 @@ static void page_cache_tree_delete(struct address_space *mapping,
> __radix_tree_lookup(&mapping->page_tree, page->index + i,
> &node, &slot);
>
> - radix_tree_clear_tags(&mapping->page_tree, node, slot);
> -
> if (!node) {
> VM_BUG_ON_PAGE(nr != 1, page);
> /*
> @@ -193,33 +173,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
> shadow = NULL;
> }
>
> - radix_tree_replace_slot(&mapping->page_tree, slot, shadow);
> -
> - if (!node)
> - break;
> -
> - workingset_node_pages_dec(node);
> - if (shadow)
> - workingset_node_shadows_inc(node);
> - else
> - if (__radix_tree_delete_node(&mapping->page_tree, node))
> - continue;
> -
> - /*
> - * Track node that only contains shadow entries. DAX mappings
> - * contain no shadow entries and may contain other exceptional
> - * entries so skip those.
> - *
> - * Avoid acquiring the list_lru lock if already tracked.
> - * The list_empty() test is safe as node->private_list is
> - * protected by mapping->tree_lock.
> - */
> - if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
> - list_empty(&node->private_list)) {
> - node->private_data = mapping;
> - list_lru_add(&workingset_shadow_nodes,
> - &node->private_list);
> - }
> + radix_tree_clear_tags(&mapping->page_tree, node, slot);
> + __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
> + workingset_update_node, mapping);
> }
>
> if (shadow) {
> diff --git a/mm/truncate.c b/mm/truncate.c
> index 6ae44571d4c7..7e5464a611db 100644
> --- a/mm/truncate.c
> +++ b/mm/truncate.c
> @@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping,
> * without the tree itself locked. These unlocked entries
> * need verification under the tree lock.
> */
> - if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
> - &slot))
> + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
> goto unlock;
> if (*slot != entry)
> goto unlock;
> - radix_tree_replace_slot(&mapping->page_tree, slot, NULL);
> + __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
> + workingset_update_node, mapping);
> mapping->nrexceptional--;
> - if (!node)
> - goto unlock;
> - workingset_node_shadows_dec(node);
> - /*
> - * Don't track node without shadow entries.
> - *
> - * Avoid acquiring the list_lru lock if already untracked.
> - * The list_empty() test is safe as node->private_list is
> - * protected by mapping->tree_lock.
> - */
> - if (!workingset_node_shadows(node) &&
> - !list_empty(&node->private_list))
> - list_lru_del(&workingset_shadow_nodes,
> - &node->private_list);
> - __radix_tree_delete_node(&mapping->page_tree, node);
> unlock:
> spin_unlock_irq(&mapping->tree_lock);
> }
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 3cfc61d84a52..111e06559892 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -10,6 +10,7 @@
> #include <linux/atomic.h>
> #include <linux/module.h>
> #include <linux/swap.h>
> +#include <linux/dax.h>
> #include <linux/fs.h>
> #include <linux/mm.h>
>
> @@ -334,18 +335,45 @@ void workingset_activation(struct page *page)
> * point where they would still be useful.
> */
>
> -struct list_lru workingset_shadow_nodes;
> +static struct list_lru shadow_nodes;
> +
> +void workingset_update_node(struct radix_tree_node *node, void *private)
> +{
> + struct address_space *mapping = private;
> +
> + /* Only regular page cache has shadow entries */
> + if (dax_mapping(mapping) || shmem_mapping(mapping))
> + return;
> +
> + /*
> + * Track non-empty nodes that contain only shadow entries;
> + * unlink those that contain pages or are being freed.
> + *
> + * Avoid acquiring the list_lru lock when the nodes are
> + * already where they should be. The list_empty() test is safe
> + * as node->private_list is protected by &mapping->tree_lock.
> + */
> + if (node->count && node->count == node->exceptional) {
> + if (list_empty(&node->private_list)) {
> + node->private_data = mapping;
> + list_lru_add(&shadow_nodes, &node->private_list);
> + }
> + } else {
> + if (!list_empty(&node->private_list))
> + list_lru_del(&shadow_nodes, &node->private_list);
> + }
> +}
>
> static unsigned long count_shadow_nodes(struct shrinker *shrinker,
> struct shrink_control *sc)
> {
> - unsigned long shadow_nodes;
> unsigned long max_nodes;
> + unsigned long nodes;
> unsigned long pages;
>
> /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
> local_irq_disable();
> - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
> + nodes = list_lru_shrink_count(&shadow_nodes, sc);
> local_irq_enable();
>
> if (memcg_kmem_enabled()) {
> @@ -372,10 +400,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
> */
> max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
>
> - if (shadow_nodes <= max_nodes)
> + if (nodes <= max_nodes)
> return 0;
>
> - return shadow_nodes - max_nodes;
> + return nodes - max_nodes;
> }
>
> static enum lru_status shadow_lru_isolate(struct list_head *item,
> @@ -418,22 +446,25 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
> * no pages, so we expect to be able to remove them all and
> * delete and free the empty node afterwards.
> */
> - if (WARN_ON_ONCE(!workingset_node_shadows(node)))
> + if (WARN_ON_ONCE(!node->exceptional))
> goto out_invalid;
> - if (WARN_ON_ONCE(workingset_node_pages(node)))
> + if (WARN_ON_ONCE(node->count != node->exceptional))
> goto out_invalid;
> for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
> if (node->slots[i]) {
> if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
> goto out_invalid;
> + if (WARN_ON_ONCE(!node->exceptional))
> + goto out_invalid;
> if (WARN_ON_ONCE(!mapping->nrexceptional))
> goto out_invalid;
> node->slots[i] = NULL;
> - workingset_node_shadows_dec(node);
> + node->exceptional--;
> + node->count--;
> mapping->nrexceptional--;
> }
> }
> - if (WARN_ON_ONCE(workingset_node_shadows(node)))
> + if (WARN_ON_ONCE(node->exceptional))
> goto out_invalid;
> inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
> __radix_tree_delete_node(&mapping->page_tree, node);
> @@ -456,8 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
>
> /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
> local_irq_disable();
> - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
> - shadow_lru_isolate, NULL);
> + ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
> local_irq_enable();
> return ret;
> }
> @@ -496,7 +526,7 @@ static int __init workingset_init(void)
> pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
> timestamp_bits, max_order, bucket_order);
>
> - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
> + ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key);
> if (ret)
> goto err;
> ret = register_shrinker(&workingset_shadow_shrinker);
> @@ -504,7 +534,7 @@ static int __init workingset_init(void)
> goto err_list_lru;
> return 0;
> err_list_lru:
> - list_lru_destroy(&workingset_shadow_nodes);
> + list_lru_destroy(&shadow_nodes);
> err:
> return ret;
> }
> --
> 2.10.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR
On Thu 17-11-16 14:32:44, Johannes Weiner wrote:
> Shadow entries in the page cache used to be accounted behind the radix
> tree implementation's back in the upper bits of node->count, and the
> radix tree code extending a single-entry tree with a shadow entry in
> root->rnode would corrupt that counter. As a result, we could not put
> shadow entries at index 0 if the tree didn't have any other entries,
> and that means no refault detection for any single-page file.
>
> Now that the shadow entries are tracked natively in the radix tree's
> exceptional counter, this is no longer necessary. Extending and
> shrinking the tree from and to single entries in root->rnode now does
> the right thing when the entry is exceptional, so remove that limitation.
>
> Signed-off-by: Johannes Weiner <[email protected]>
Looks good. You can add:
Reviewed-by: Jan Kara <[email protected]>
Honza
> ---
> mm/filemap.c | 9 +--------
> 1 file changed, 1 insertion(+), 8 deletions(-)
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 7d92032277ff..ae7b6992aded 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -164,14 +164,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
> __radix_tree_lookup(&mapping->page_tree, page->index + i,
> &node, &slot);
>
> - if (!node) {
> - VM_BUG_ON_PAGE(nr != 1, page);
> - /*
> - * We need a node to properly account shadow
> - * entries. Don't plant any without. XXX
> - */
> - shadow = NULL;
> - }
> + VM_BUG_ON_PAGE(!node && nr != 1, page);
>
> radix_tree_clear_tags(&mapping->page_tree, node, slot);
> __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
> --
> 2.10.2
>
--
Jan Kara <[email protected]>
SUSE Labs, CR