Hi folks,
These are some cleanups and fixes for the compressed part I'd like to
aim for the next cycle. I will send several versions if there are more
patches available. This ongoing cleanup work are also for later folio
adaption.
I've set up a stress test for this patchset at the same time.
Thanks,
Gao Xiang
Gao Xiang (6):
erofs: allocate extra bvec pages directly instead of retrying
erofs: avoid on-stack pagepool directly passed by arguments
erofs: kill hooked chains to avoid loops on deduplicated compressed
images
erofs: adapt managed inode operations into folios
erofs: use struct lockref to replace handcrafted approach
erofs: use poison pointer to replace the hard-coded address
fs/erofs/internal.h | 41 +-------
fs/erofs/super.c | 62 ------------
fs/erofs/utils.c | 87 ++++++++--------
fs/erofs/zdata.c | 238 ++++++++++++++++++++------------------------
4 files changed, 156 insertions(+), 272 deletions(-)
--
2.24.4
It's safer and cleaner to replace such hard-coded illegal pointer
with poison pointers.
Signed-off-by: Gao Xiang <[email protected]>
---
fs/erofs/zdata.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 2ea8e7f08372..83df1954b859 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -91,10 +91,8 @@ struct z_erofs_pcluster {
struct z_erofs_bvec compressed_bvecs[];
};
-/* let's avoid the valid 32-bit kernel addresses */
-
/* the end of a chain of pclusters */
-#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
+#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA)
#define Z_EROFS_PCLUSTER_NIL (NULL)
struct z_erofs_decompressqueue {
--
2.24.4
On-stack pagepool is used so that short-lived temporary pages could be
shared within a single I/O request (e.g. among multiple pclusters).
Moving the remaining frontend-related uses into
z_erofs_decompress_frontend to avoid too many arguments.
Signed-off-by: Gao Xiang <[email protected]>
---
fs/erofs/zdata.c | 64 +++++++++++++++++++++++-------------------------
1 file changed, 30 insertions(+), 34 deletions(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 59dc2537af00..a67f4ac19c48 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -240,13 +240,14 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct z_erofs_bvec *bvec,
- struct page **candidate_bvpage)
+ struct page **candidate_bvpage,
+ struct page **pagepool)
{
if (iter->cur >= iter->nr) {
struct page *nextpage = *candidate_bvpage;
if (!nextpage) {
- nextpage = alloc_page(GFP_NOFS);
+ nextpage = erofs_allocpage(pagepool, GFP_NOFS);
if (!nextpage)
return -ENOMEM;
set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
@@ -549,6 +550,7 @@ struct z_erofs_decompress_frontend {
struct erofs_map_blocks map;
struct z_erofs_bvec_iter biter;
+ struct page *pagepool;
struct page *candidate_bvpage;
struct z_erofs_pcluster *pcl, *tailpcl;
z_erofs_next_pcluster_t owned_head;
@@ -583,8 +585,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
return false;
}
-static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
- struct page **pagepool)
+static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
@@ -625,7 +626,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
* succeeds or fallback to in-place I/O instead
* to avoid any direct reclaim.
*/
- newpage = erofs_allocpage(pagepool, gfp);
+ newpage = erofs_allocpage(&fe->pagepool, gfp);
if (!newpage)
continue;
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
@@ -638,7 +639,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
if (page)
put_page(page);
else if (newpage)
- erofs_pagepool_add(pagepool, newpage);
+ erofs_pagepool_add(&fe->pagepool, newpage);
}
/*
@@ -736,7 +737,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
!fe->candidate_bvpage)
fe->candidate_bvpage = bvec->page;
}
- ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
+ ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
+ &fe->pagepool);
fe->pcl->vcnt += (ret >= 0);
return ret;
}
@@ -961,7 +963,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, struct page **pagepool)
+ struct page *page)
{
struct inode *const inode = fe->inode;
struct erofs_map_blocks *const map = &fe->map;
@@ -1019,7 +1021,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
- z_erofs_bind_cache(fe, pagepool);
+ z_erofs_bind_cache(fe);
}
hitted:
/*
@@ -1662,7 +1664,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
}
static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
- struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg, bool readahead)
{
@@ -1725,8 +1726,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
do {
struct page *page;
- page = pickup_page_for_submission(pcl, i++, pagepool,
- mc);
+ page = pickup_page_for_submission(pcl, i++,
+ &f->pagepool, mc);
if (!page)
continue;
@@ -1791,16 +1792,16 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
}
static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
- struct page **pagepool, bool force_fg, bool ra)
+ bool force_fg, bool ra)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(f, pagepool, io, &force_fg, ra);
+ z_erofs_submit_queue(f, io, &force_fg, ra);
/* handle bypass queue (no i/o pclusters) immediately */
- z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
+ z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
if (!force_fg)
return;
@@ -1809,7 +1810,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
- z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
+ z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
}
/*
@@ -1817,8 +1818,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
* approximate readmore strategies as a start.
*/
static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
- struct readahead_control *rac,
- struct page **pagepool, bool backmost)
+ struct readahead_control *rac, bool backmost)
{
struct inode *inode = f->inode;
struct erofs_map_blocks *map = &f->map;
@@ -1860,7 +1860,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
if (PageUptodate(page)) {
unlock_page(page);
} else {
- err = z_erofs_do_read_page(f, page, pagepool);
+ err = z_erofs_do_read_page(f, page);
if (err)
erofs_err(inode->i_sb,
"readmore error at page %lu @ nid %llu",
@@ -1881,27 +1881,24 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
struct inode *const inode = page->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *pagepool = NULL;
int err;
trace_erofs_readpage(page, false);
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
- z_erofs_pcluster_readmore(&f, NULL, &pagepool, true);
- err = z_erofs_do_read_page(&f, page, &pagepool);
- z_erofs_pcluster_readmore(&f, NULL, &pagepool, false);
-
+ z_erofs_pcluster_readmore(&f, NULL, true);
+ err = z_erofs_do_read_page(&f, page);
+ z_erofs_pcluster_readmore(&f, NULL, false);
(void)z_erofs_collector_end(&f);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(&f, &pagepool, z_erofs_is_sync_decompress(sbi, 0),
- false);
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
erofs_put_metabuf(&f.map.buf);
- erofs_release_pages(&pagepool);
+ erofs_release_pages(&f.pagepool);
return err;
}
@@ -1910,12 +1907,12 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *pagepool = NULL, *head = NULL, *page;
+ struct page *head = NULL, *page;
unsigned int nr_pages;
f.headoffset = readahead_pos(rac);
- z_erofs_pcluster_readmore(&f, rac, &pagepool, true);
+ z_erofs_pcluster_readmore(&f, rac, true);
nr_pages = readahead_count(rac);
trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
@@ -1931,20 +1928,19 @@ static void z_erofs_readahead(struct readahead_control *rac)
/* traversal in reverse order */
head = (void *)page_private(page);
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
if (err)
erofs_err(inode->i_sb,
"readahead error at page %lu @ nid %llu",
page->index, EROFS_I(inode)->nid);
put_page(page);
}
- z_erofs_pcluster_readmore(&f, rac, &pagepool, false);
+ z_erofs_pcluster_readmore(&f, rac, false);
(void)z_erofs_collector_end(&f);
- z_erofs_runqueue(&f, &pagepool,
- z_erofs_is_sync_decompress(sbi, nr_pages), true);
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true);
erofs_put_metabuf(&f.map.buf);
- erofs_release_pages(&pagepool);
+ erofs_release_pages(&f.pagepool);
}
const struct address_space_operations z_erofs_aops = {
--
2.24.4
Let's avoid the current handcrafted lockref although `struct lockref`
inclusion usually increases extra 4 bytes with an explicit spinlock if
CONFIG_DEBUG_SPINLOCK is off.
Apart from the size difference, note that the meaning of refcount is
also changed to active users. IOWs, it doesn't take an extra refcount
for XArray tree insertion.
I don't observe any significant performance difference at least on
our cloud compute server but the new one indeed simplifies the
overall codebase a bit.
Signed-off-by: Gao Xiang <[email protected]>
---
fs/erofs/internal.h | 38 ++------------------
fs/erofs/utils.c | 87 ++++++++++++++++++++++-----------------------
fs/erofs/zdata.c | 15 ++++----
3 files changed, 53 insertions(+), 87 deletions(-)
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 0b8506c39145..e63f6cd424a0 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -208,46 +208,12 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
-
/* basic unit of the workstation of a super_block */
struct erofs_workgroup {
- /* the workgroup index in the workstation */
pgoff_t index;
-
- /* overall workgroup reference count */
- atomic_t refcount;
+ struct lockref lockref;
};
-static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
- int val)
-{
- preempt_disable();
- if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
- preempt_enable();
- return false;
- }
- return true;
-}
-
-static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
- int orig_val)
-{
- /*
- * other observers should notice all modifications
- * in the freezing period.
- */
- smp_mb();
- atomic_set(&grp->refcount, orig_val);
- preempt_enable();
-}
-
-static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
-{
- return atomic_cond_read_relaxed(&grp->refcount,
- VAL != EROFS_LOCKED_MAGIC);
-}
-
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
@@ -492,7 +458,7 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-int erofs_workgroup_put(struct erofs_workgroup *grp);
+void erofs_workgroup_put(struct erofs_workgroup *grp);
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
pgoff_t index);
struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 46627cb69abe..6895680e1372 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -33,22 +33,21 @@ void erofs_release_pages(struct page **pagepool)
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
-static int erofs_workgroup_get(struct erofs_workgroup *grp)
+static bool erofs_workgroup_get(struct erofs_workgroup *grp)
{
- int o;
+ if (lockref_get_not_zero(&grp->lockref))
+ return true;
-repeat:
- o = erofs_wait_on_workgroup_freezed(grp);
- if (o <= 0)
- return -1;
-
- if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
- goto repeat;
+ spin_lock(&grp->lockref.lock);
+ if (__lockref_is_dead(&grp->lockref)) {
+ spin_unlock(&grp->lockref.lock);
+ return false;
+ }
- /* decrease refcount paired by erofs_workgroup_put */
- if (o == 1)
+ if (!grp->lockref.count++)
atomic_long_dec(&erofs_global_shrink_cnt);
- return 0;
+ spin_unlock(&grp->lockref.lock);
+ return true;
}
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
@@ -61,7 +60,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
rcu_read_lock();
grp = xa_load(&sbi->managed_pslots, index);
if (grp) {
- if (erofs_workgroup_get(grp)) {
+ if (!erofs_workgroup_get(grp)) {
/* prefer to relax rcu read side */
rcu_read_unlock();
goto repeat;
@@ -80,11 +79,10 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
struct erofs_workgroup *pre;
/*
- * Bump up a reference count before making this visible
- * to others for the XArray in order to avoid potential
- * UAF without serialized by xa_lock.
+ * Bump up before making this visible to others for the XArray in order
+ * to avoid potential UAF without serialized by xa_lock.
*/
- atomic_inc(&grp->refcount);
+ lockref_get(&grp->lockref);
repeat:
xa_lock(&sbi->managed_pslots);
@@ -93,13 +91,13 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
if (pre) {
if (xa_is_err(pre)) {
pre = ERR_PTR(xa_err(pre));
- } else if (erofs_workgroup_get(pre)) {
+ } else if (!erofs_workgroup_get(pre)) {
/* try to legitimize the current in-tree one */
xa_unlock(&sbi->managed_pslots);
cond_resched();
goto repeat;
}
- atomic_dec(&grp->refcount);
+ lockref_put_return(&grp->lockref);
grp = pre;
}
xa_unlock(&sbi->managed_pslots);
@@ -112,38 +110,36 @@ static void __erofs_workgroup_free(struct erofs_workgroup *grp)
erofs_workgroup_free_rcu(grp);
}
-int erofs_workgroup_put(struct erofs_workgroup *grp)
+void erofs_workgroup_put(struct erofs_workgroup *grp)
{
- int count = atomic_dec_return(&grp->refcount);
+ if (lockref_put_not_zero(&grp->lockref))
+ return;
- if (count == 1)
+ spin_lock(&grp->lockref.lock);
+ DBG_BUGON(__lockref_is_dead(&grp->lockref));
+ if (grp->lockref.count == 1) {
atomic_long_inc(&erofs_global_shrink_cnt);
- else if (!count)
- __erofs_workgroup_free(grp);
- return count;
+ --grp->lockref.count;
+ }
+ spin_unlock(&grp->lockref.lock);
}
static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
struct erofs_workgroup *grp)
{
- /*
- * If managed cache is on, refcount of workgroups
- * themselves could be < 0 (freezed). In other words,
- * there is no guarantee that all refcounts > 0.
- */
- if (!erofs_workgroup_try_to_freeze(grp, 1))
- return false;
+ int free = false;
+
+ spin_lock(&grp->lockref.lock);
+ if (grp->lockref.count)
+ goto out;
/*
- * Note that all cached pages should be unattached
- * before deleted from the XArray. Otherwise some
- * cached pages could be still attached to the orphan
- * old workgroup when the new one is available in the tree.
+ * Note that all cached pages should be detached before deleted from
+ * the XArray. Otherwise some cached pages could be still attached to
+ * the orphan old workgroup when the new one is available in the tree.
*/
- if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
- erofs_workgroup_unfreeze(grp, 1);
- return false;
- }
+ if (erofs_try_to_free_all_cached_pages(sbi, grp))
+ goto out;
/*
* It's impossible to fail after the workgroup is freezed,
@@ -152,10 +148,13 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
*/
DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
- /* last refcount should be connected with its managed pslot. */
- erofs_workgroup_unfreeze(grp, 0);
- __erofs_workgroup_free(grp);
- return true;
+ lockref_mark_dead(&grp->lockref);
+ free = true;
+out:
+ spin_unlock(&grp->lockref.lock);
+ if (free)
+ __erofs_workgroup_free(grp);
+ return free;
}
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 15a383899540..2ea8e7f08372 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -643,7 +643,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
/*
- * refcount of workgroup is now freezed as 1,
+ * refcount of workgroup is now freezed as 0,
* therefore no need to worry about available decompression users.
*/
for (i = 0; i < pcl->pclusterpages; ++i) {
@@ -676,10 +676,11 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
if (!folio_test_private(folio))
return true;
- if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
- return false;
-
ret = false;
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->obj.lockref.count > 0)
+ goto out;
+
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
for (i = 0; i < pcl->pclusterpages; ++i) {
if (pcl->compressed_bvecs[i].page == &folio->page) {
@@ -688,10 +689,10 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
break;
}
}
- erofs_workgroup_unfreeze(&pcl->obj, 1);
-
if (ret)
folio_detach_private(folio);
+out:
+ spin_unlock(&pcl->obj.lockref.lock);
return ret;
}
@@ -807,7 +808,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- atomic_set(&pcl->obj.refcount, 1);
+ spin_lock_init(&pcl->obj.lockref.lock);
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
--
2.24.4
After heavily stressing EROFS with several images which include a
hand-crafted image of repeated patterns for more than 46 days, I found
two chains could be linked with each other almost simultaneously and
form a loop so that the entire loop won't be submitted. As a
consequence, the corresponding file pages will remain locked forever.
It can be _only_ observed on data-deduplicated compressed images.
For example, consider two chains with five pclusters in total:
Chain 1: 2->3->4->5 -- The tail pcluster is 5;
Chain 2: 5->1->2 -- The tail pcluster is 2.
Chain 2 could link to Chain 1 with pcluster 5; and Chain 1 could link
to Chain 2 at the same time with pcluster 2.
Since hooked chains are all linked locklessly now, I have no idea how
to simply avoid the race. Instead, let's avoid hooked chains completely
until I could work out a proper way to fix this and end users finally
tell us that it's needed to add it back.
Actually, this optimization can be found with multi-threaded workloads
(especially even more often on deduplicated compressed images), yet I'm
not sure about the overall system impacts of not having this compared
with implementation complexity.
Fixes: 267f2492c8f7 ("erofs: introduce multi-reference pclusters (fully-referenced)")
Signed-off-by: Gao Xiang <[email protected]>
---
fs/erofs/zdata.c | 72 ++++++++----------------------------------------
1 file changed, 11 insertions(+), 61 deletions(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index a67f4ac19c48..76488824f146 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -93,11 +93,8 @@ struct z_erofs_pcluster {
/* let's avoid the valid 32-bit kernel addresses */
-/* the chained workgroup has't submitted io (still open) */
+/* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
-/* the chained workgroup has already submitted io */
-#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)
-
#define Z_EROFS_PCLUSTER_NIL (NULL)
struct z_erofs_decompressqueue {
@@ -506,20 +503,6 @@ int __init z_erofs_init_zip_subsystem(void)
enum z_erofs_pclustermode {
Z_EROFS_PCLUSTER_INFLIGHT,
- /*
- * The current pclusters was the tail of an exist chain, in addition
- * that the previous processed chained pclusters are all decided to
- * be hooked up to it.
- * A new chain will be created for the remaining pclusters which are
- * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
- * the next pcluster cannot reuse the whole page safely for inplace I/O
- * in the following scenario:
- * ________________________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (belongs to the next pcl) | (belongs to the current pcl) |
- * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
- */
- Z_EROFS_PCLUSTER_HOOKED,
/*
* a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
* could be dispatched into bypass queue later due to uptodated managed
@@ -537,8 +520,8 @@ enum z_erofs_pclustermode {
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
* | (of the current cl) | (of the previous collection) |
- * | PCLUSTER_FOLLOWED or | |
- * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
+ * | | |
+ * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
*
* [ (*) the above page can be used as inplace I/O. ]
*/
@@ -552,7 +535,7 @@ struct z_erofs_decompress_frontend {
struct page *pagepool;
struct page *candidate_bvpage;
- struct z_erofs_pcluster *pcl, *tailpcl;
+ struct z_erofs_pcluster *pcl;
z_erofs_next_pcluster_t owned_head;
enum z_erofs_pclustermode mode;
@@ -757,19 +740,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
return;
}
- /*
- * type 2, link to the end of an existing open chain, be careful
- * that its submission is controlled by the original attached chain.
- */
- if (*owned_head != &pcl->next && pcl != f->tailpcl &&
- cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
- *owned_head = Z_EROFS_PCLUSTER_TAIL;
- f->mode = Z_EROFS_PCLUSTER_HOOKED;
- f->tailpcl = NULL;
- return;
- }
- /* type 3, it belongs to a chain, but it isn't the end of the chain */
+ /* type 2, it belongs to an ongoing chain */
f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
}
@@ -830,9 +801,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
goto err_out;
}
}
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = pcl;
fe->owned_head = &pcl->next;
fe->pcl = pcl;
return 0;
@@ -853,7 +821,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
- DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (!(map->m_flags & EROFS_MAP_META)) {
grp = erofs_find_workgroup(fe->inode->i_sb,
@@ -872,10 +839,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
if (ret == -EEXIST) {
mutex_lock(&fe->pcl->lock);
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = fe->pcl;
-
z_erofs_try_to_claim_pcluster(fe);
} else if (ret) {
return ret;
@@ -1030,8 +993,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
* those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or bvpage (should be processed in a strict order.)
*/
- tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
- fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -1400,10 +1362,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
};
z_erofs_next_pcluster_t owned = io->head;
- while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
- /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
- DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
- /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
+ while (owned != Z_EROFS_PCLUSTER_TAIL) {
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
be.pcl = container_of(owned, struct z_erofs_pcluster, next);
@@ -1420,7 +1379,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
container_of(work, struct z_erofs_decompressqueue, u.work);
struct page *pagepool = NULL;
- DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
z_erofs_decompress_queue(bgq, &pagepool);
erofs_release_pages(&pagepool);
kvfree(bgq);
@@ -1608,7 +1567,7 @@ static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
q->sync = true;
}
q->sb = sb;
- q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
+ q->head = Z_EROFS_PCLUSTER_TAIL;
return q;
}
@@ -1626,11 +1585,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- if (owned_head == Z_EROFS_PCLUSTER_TAIL)
- owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
-
- WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
WRITE_ONCE(*submit_qtail, owned_head);
WRITE_ONCE(*bypass_qtail, &pcl->next);
@@ -1700,15 +1655,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
unsigned int i = 0;
bool bypass = true;
- /* no possible 'owned_head' equals the following */
- DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
-
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
+ owned_head = READ_ONCE(pcl->next);
- /* close the main owned chain at first */
- owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (z_erofs_is_inline_pcluster(pcl)) {
move_to_bypass_jobqueue(pcl, qtail, owned_head);
continue;
--
2.24.4
This patch gets rid of erofs_try_to_free_cached_page() and fold it
into .release_folio().
It also moves managed inode operations into zdata.c, which simplifies
the code a bit. No logic changes.
Signed-off-by: Gao Xiang <[email protected]>
---
fs/erofs/internal.h | 3 ++-
fs/erofs/super.c | 62 ---------------------------------------------
fs/erofs/zdata.c | 59 ++++++++++++++++++++++++++++++++++++------
3 files changed, 53 insertions(+), 71 deletions(-)
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index af0431a40647..0b8506c39145 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -506,12 +506,12 @@ int __init z_erofs_init_zip_subsystem(void);
void z_erofs_exit_zip_subsystem(void);
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
-int erofs_try_to_free_cached_page(struct page *page);
int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb,
struct z_erofs_lz4_cfgs *lz4, int len);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
int flags);
+int erofs_init_managed_cache(struct super_block *sb);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
@@ -529,6 +529,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb,
}
return 0;
}
+static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
#ifdef CONFIG_EROFS_FS_ZIP_LZMA
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 811ab66d805e..c2829c91812b 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -599,68 +599,6 @@ static int erofs_fc_parse_param(struct fs_context *fc,
return 0;
}
-#ifdef CONFIG_EROFS_FS_ZIP
-static const struct address_space_operations managed_cache_aops;
-
-static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp)
-{
- bool ret = true;
- struct address_space *const mapping = folio->mapping;
-
- DBG_BUGON(!folio_test_locked(folio));
- DBG_BUGON(mapping->a_ops != &managed_cache_aops);
-
- if (folio_test_private(folio))
- ret = erofs_try_to_free_cached_page(&folio->page);
-
- return ret;
-}
-
-/*
- * It will be called only on inode eviction. In case that there are still some
- * decompression requests in progress, wait with rescheduling for a bit here.
- * We could introduce an extra locking instead but it seems unnecessary.
- */
-static void erofs_managed_cache_invalidate_folio(struct folio *folio,
- size_t offset, size_t length)
-{
- const size_t stop = length + offset;
-
- DBG_BUGON(!folio_test_locked(folio));
-
- /* Check for potential overflow in debug mode */
- DBG_BUGON(stop > folio_size(folio) || stop < length);
-
- if (offset == 0 && stop == folio_size(folio))
- while (!erofs_managed_cache_release_folio(folio, GFP_NOFS))
- cond_resched();
-}
-
-static const struct address_space_operations managed_cache_aops = {
- .release_folio = erofs_managed_cache_release_folio,
- .invalidate_folio = erofs_managed_cache_invalidate_folio,
-};
-
-static int erofs_init_managed_cache(struct super_block *sb)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- struct inode *const inode = new_inode(sb);
-
- if (!inode)
- return -ENOMEM;
-
- set_nlink(inode, 1);
- inode->i_size = OFFSET_MAX;
-
- inode->i_mapping->a_ops = &managed_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- sbi->managed_cache = inode;
- return 0;
-}
-#else
-static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
-#endif
-
static struct inode *erofs_nfs_get_inode(struct super_block *sb,
u64 ino, u32 generation)
{
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 76488824f146..15a383899540 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -667,29 +667,72 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
return 0;
}
-int erofs_try_to_free_cached_page(struct page *page)
+static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
- struct z_erofs_pcluster *const pcl = (void *)page_private(page);
- int ret, i;
+ struct z_erofs_pcluster *pcl = folio_get_private(folio);
+ bool ret;
+ int i;
+
+ if (!folio_test_private(folio))
+ return true;
if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
- return 0;
+ return false;
- ret = 0;
+ ret = false;
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
for (i = 0; i < pcl->pclusterpages; ++i) {
- if (pcl->compressed_bvecs[i].page == page) {
+ if (pcl->compressed_bvecs[i].page == &folio->page) {
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
- ret = 1;
+ ret = true;
break;
}
}
erofs_workgroup_unfreeze(&pcl->obj, 1);
+
if (ret)
- detach_page_private(page);
+ folio_detach_private(folio);
return ret;
}
+/*
+ * It will be called only on inode eviction. In case that there are still some
+ * decompression requests in progress, wait with rescheduling for a bit here.
+ * An extra lock could be introduced instead but it seems unnecessary.
+ */
+static void z_erofs_cache_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
+{
+ const size_t stop = length + offset;
+
+ /* Check for potential overflow in debug mode */
+ DBG_BUGON(stop > folio_size(folio) || stop < length);
+
+ if (offset == 0 && stop == folio_size(folio))
+ while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
+ cond_resched();
+}
+
+static const struct address_space_operations z_erofs_cache_aops = {
+ .release_folio = z_erofs_cache_release_folio,
+ .invalidate_folio = z_erofs_cache_invalidate_folio,
+};
+
+int erofs_init_managed_cache(struct super_block *sb)
+{
+ struct inode *const inode = new_inode(sb);
+
+ if (!inode)
+ return -ENOMEM;
+
+ set_nlink(inode, 1);
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &z_erofs_cache_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ EROFS_SB(sb)->managed_cache = inode;
+ return 0;
+}
+
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec)
{
--
2.24.4
Let's avoid the current handcrafted lockref although `struct lockref`
inclusion usually increases extra 4 bytes with an explicit spinlock if
CONFIG_DEBUG_SPINLOCK is off.
Apart from the size difference, note that the meaning of refcount is
also changed to active users. IOWs, it doesn't take an extra refcount
for XArray tree insertion.
I don't observe any significant performance difference at least on
our cloud compute server but the new one indeed simplifies the
overall codebase a bit.
Signed-off-by: Gao Xiang <[email protected]>
---
changes since v1:
- fix reference leaking due to improper fallback of
erofs_workgroup_put().
fs/erofs/internal.h | 38 ++------------------
fs/erofs/utils.c | 86 ++++++++++++++++++++++-----------------------
fs/erofs/zdata.c | 15 ++++----
3 files changed, 52 insertions(+), 87 deletions(-)
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 0b8506c39145..e63f6cd424a0 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -208,46 +208,12 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
-
/* basic unit of the workstation of a super_block */
struct erofs_workgroup {
- /* the workgroup index in the workstation */
pgoff_t index;
-
- /* overall workgroup reference count */
- atomic_t refcount;
+ struct lockref lockref;
};
-static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
- int val)
-{
- preempt_disable();
- if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
- preempt_enable();
- return false;
- }
- return true;
-}
-
-static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
- int orig_val)
-{
- /*
- * other observers should notice all modifications
- * in the freezing period.
- */
- smp_mb();
- atomic_set(&grp->refcount, orig_val);
- preempt_enable();
-}
-
-static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
-{
- return atomic_cond_read_relaxed(&grp->refcount,
- VAL != EROFS_LOCKED_MAGIC);
-}
-
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
@@ -492,7 +458,7 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-int erofs_workgroup_put(struct erofs_workgroup *grp);
+void erofs_workgroup_put(struct erofs_workgroup *grp);
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
pgoff_t index);
struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 46627cb69abe..6ed79f10e2e2 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -33,22 +33,21 @@ void erofs_release_pages(struct page **pagepool)
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
-static int erofs_workgroup_get(struct erofs_workgroup *grp)
+static bool erofs_workgroup_get(struct erofs_workgroup *grp)
{
- int o;
+ if (lockref_get_not_zero(&grp->lockref))
+ return true;
-repeat:
- o = erofs_wait_on_workgroup_freezed(grp);
- if (o <= 0)
- return -1;
-
- if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
- goto repeat;
+ spin_lock(&grp->lockref.lock);
+ if (__lockref_is_dead(&grp->lockref)) {
+ spin_unlock(&grp->lockref.lock);
+ return false;
+ }
- /* decrease refcount paired by erofs_workgroup_put */
- if (o == 1)
+ if (!grp->lockref.count++)
atomic_long_dec(&erofs_global_shrink_cnt);
- return 0;
+ spin_unlock(&grp->lockref.lock);
+ return true;
}
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
@@ -61,7 +60,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
rcu_read_lock();
grp = xa_load(&sbi->managed_pslots, index);
if (grp) {
- if (erofs_workgroup_get(grp)) {
+ if (!erofs_workgroup_get(grp)) {
/* prefer to relax rcu read side */
rcu_read_unlock();
goto repeat;
@@ -80,11 +79,10 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
struct erofs_workgroup *pre;
/*
- * Bump up a reference count before making this visible
- * to others for the XArray in order to avoid potential
- * UAF without serialized by xa_lock.
+ * Bump up before making this visible to others for the XArray in order
+ * to avoid potential UAF without serialized by xa_lock.
*/
- atomic_inc(&grp->refcount);
+ lockref_get(&grp->lockref);
repeat:
xa_lock(&sbi->managed_pslots);
@@ -93,13 +91,13 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
if (pre) {
if (xa_is_err(pre)) {
pre = ERR_PTR(xa_err(pre));
- } else if (erofs_workgroup_get(pre)) {
+ } else if (!erofs_workgroup_get(pre)) {
/* try to legitimize the current in-tree one */
xa_unlock(&sbi->managed_pslots);
cond_resched();
goto repeat;
}
- atomic_dec(&grp->refcount);
+ lockref_put_return(&grp->lockref);
grp = pre;
}
xa_unlock(&sbi->managed_pslots);
@@ -112,38 +110,35 @@ static void __erofs_workgroup_free(struct erofs_workgroup *grp)
erofs_workgroup_free_rcu(grp);
}
-int erofs_workgroup_put(struct erofs_workgroup *grp)
+void erofs_workgroup_put(struct erofs_workgroup *grp)
{
- int count = atomic_dec_return(&grp->refcount);
+ if (lockref_put_not_zero(&grp->lockref))
+ return;
- if (count == 1)
+ spin_lock(&grp->lockref.lock);
+ DBG_BUGON(__lockref_is_dead(&grp->lockref));
+ if (grp->lockref.count == 1)
atomic_long_inc(&erofs_global_shrink_cnt);
- else if (!count)
- __erofs_workgroup_free(grp);
- return count;
+ --grp->lockref.count;
+ spin_unlock(&grp->lockref.lock);
}
static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
struct erofs_workgroup *grp)
{
- /*
- * If managed cache is on, refcount of workgroups
- * themselves could be < 0 (freezed). In other words,
- * there is no guarantee that all refcounts > 0.
- */
- if (!erofs_workgroup_try_to_freeze(grp, 1))
- return false;
+ int free = false;
+
+ spin_lock(&grp->lockref.lock);
+ if (grp->lockref.count)
+ goto out;
/*
- * Note that all cached pages should be unattached
- * before deleted from the XArray. Otherwise some
- * cached pages could be still attached to the orphan
- * old workgroup when the new one is available in the tree.
+ * Note that all cached pages should be detached before deleted from
+ * the XArray. Otherwise some cached pages could be still attached to
+ * the orphan old workgroup when the new one is available in the tree.
*/
- if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
- erofs_workgroup_unfreeze(grp, 1);
- return false;
- }
+ if (erofs_try_to_free_all_cached_pages(sbi, grp))
+ goto out;
/*
* It's impossible to fail after the workgroup is freezed,
@@ -152,10 +147,13 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
*/
DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
- /* last refcount should be connected with its managed pslot. */
- erofs_workgroup_unfreeze(grp, 0);
- __erofs_workgroup_free(grp);
- return true;
+ lockref_mark_dead(&grp->lockref);
+ free = true;
+out:
+ spin_unlock(&grp->lockref.lock);
+ if (free)
+ __erofs_workgroup_free(grp);
+ return free;
}
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 15a383899540..2ea8e7f08372 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -643,7 +643,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
/*
- * refcount of workgroup is now freezed as 1,
+ * refcount of workgroup is now freezed as 0,
* therefore no need to worry about available decompression users.
*/
for (i = 0; i < pcl->pclusterpages; ++i) {
@@ -676,10 +676,11 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
if (!folio_test_private(folio))
return true;
- if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
- return false;
-
ret = false;
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->obj.lockref.count > 0)
+ goto out;
+
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
for (i = 0; i < pcl->pclusterpages; ++i) {
if (pcl->compressed_bvecs[i].page == &folio->page) {
@@ -688,10 +689,10 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
break;
}
}
- erofs_workgroup_unfreeze(&pcl->obj, 1);
-
if (ret)
folio_detach_private(folio);
+out:
+ spin_unlock(&pcl->obj.lockref.lock);
return ret;
}
@@ -807,7 +808,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- atomic_set(&pcl->obj.refcount, 1);
+ spin_lock_init(&pcl->obj.lockref.lock);
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
--
2.24.4
On Mon, 29 May 2023 15:29:23 +0800
Gao Xiang <[email protected]> wrote:
> Let's avoid the current handcrafted lockref although `struct lockref`
> inclusion usually increases extra 4 bytes with an explicit spinlock if
> CONFIG_DEBUG_SPINLOCK is off.
>
> Apart from the size difference, note that the meaning of refcount is
> also changed to active users. IOWs, it doesn't take an extra refcount
> for XArray tree insertion.
>
> I don't observe any significant performance difference at least on
> our cloud compute server but the new one indeed simplifies the
> overall codebase a bit.
>
> Signed-off-by: Gao Xiang <[email protected]>
> ---
> changes since v1:
> - fix reference leaking due to improper fallback of
> erofs_workgroup_put().
>
> fs/erofs/internal.h | 38 ++------------------
> fs/erofs/utils.c | 86 ++++++++++++++++++++++-----------------------
> fs/erofs/zdata.c | 15 ++++----
> 3 files changed, 52 insertions(+), 87 deletions(-)
>
> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
> index 0b8506c39145..e63f6cd424a0 100644
> --- a/fs/erofs/internal.h
> +++ b/fs/erofs/internal.h
> @@ -208,46 +208,12 @@ enum {
> EROFS_ZIP_CACHE_READAROUND
> };
>
> -#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
> -
> /* basic unit of the workstation of a super_block */
> struct erofs_workgroup {
> - /* the workgroup index in the workstation */
> pgoff_t index;
> -
> - /* overall workgroup reference count */
> - atomic_t refcount;
> + struct lockref lockref;
> };
>
> -static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
> - int val)
> -{
> - preempt_disable();
> - if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
> - preempt_enable();
> - return false;
> - }
> - return true;
> -}
> -
> -static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
> - int orig_val)
> -{
> - /*
> - * other observers should notice all modifications
> - * in the freezing period.
> - */
> - smp_mb();
> - atomic_set(&grp->refcount, orig_val);
> - preempt_enable();
> -}
> -
> -static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
> -{
> - return atomic_cond_read_relaxed(&grp->refcount,
> - VAL != EROFS_LOCKED_MAGIC);
> -}
> -
> enum erofs_kmap_type {
> EROFS_NO_KMAP, /* don't map the buffer */
> EROFS_KMAP, /* use kmap_local_page() to map the buffer */
> @@ -492,7 +458,7 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
> void erofs_release_pages(struct page **pagepool);
>
> #ifdef CONFIG_EROFS_FS_ZIP
> -int erofs_workgroup_put(struct erofs_workgroup *grp);
> +void erofs_workgroup_put(struct erofs_workgroup *grp);
> struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
> pgoff_t index);
> struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
> diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
> index 46627cb69abe..6ed79f10e2e2 100644
> --- a/fs/erofs/utils.c
> +++ b/fs/erofs/utils.c
> @@ -33,22 +33,21 @@ void erofs_release_pages(struct page **pagepool)
> /* global shrink count (for all mounted EROFS instances) */
> static atomic_long_t erofs_global_shrink_cnt;
>
> -static int erofs_workgroup_get(struct erofs_workgroup *grp)
> +static bool erofs_workgroup_get(struct erofs_workgroup *grp)
> {
> - int o;
> + if (lockref_get_not_zero(&grp->lockref))
> + return true;
>
> -repeat:
> - o = erofs_wait_on_workgroup_freezed(grp);
> - if (o <= 0)
> - return -1;
> -
> - if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
> - goto repeat;
> + spin_lock(&grp->lockref.lock);
> + if (__lockref_is_dead(&grp->lockref)) {
> + spin_unlock(&grp->lockref.lock);
> + return false;
> + }
>
> - /* decrease refcount paired by erofs_workgroup_put */
> - if (o == 1)
> + if (!grp->lockref.count++)
> atomic_long_dec(&erofs_global_shrink_cnt);
> - return 0;
> + spin_unlock(&grp->lockref.lock);
> + return true;
> }
May use lockref_get_not_dead() to simplify it a bit?
>
> struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
> @@ -61,7 +60,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
> rcu_read_lock();
> grp = xa_load(&sbi->managed_pslots, index);
> if (grp) {
> - if (erofs_workgroup_get(grp)) {
> + if (!erofs_workgroup_get(grp)) {
> /* prefer to relax rcu read side */
> rcu_read_unlock();
> goto repeat;
> @@ -80,11 +79,10 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
> struct erofs_workgroup *pre;
>
> /*
> - * Bump up a reference count before making this visible
> - * to others for the XArray in order to avoid potential
> - * UAF without serialized by xa_lock.
> + * Bump up before making this visible to others for the XArray in order
> + * to avoid potential UAF without serialized by xa_lock.
> */
> - atomic_inc(&grp->refcount);
> + lockref_get(&grp->lockref);
>
> repeat:
> xa_lock(&sbi->managed_pslots);
> @@ -93,13 +91,13 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
> if (pre) {
> if (xa_is_err(pre)) {
> pre = ERR_PTR(xa_err(pre));
> - } else if (erofs_workgroup_get(pre)) {
> + } else if (!erofs_workgroup_get(pre)) {
> /* try to legitimize the current in-tree one */
> xa_unlock(&sbi->managed_pslots);
> cond_resched();
> goto repeat;
> }
> - atomic_dec(&grp->refcount);
> + lockref_put_return(&grp->lockref);
Should check return error?
> grp = pre;
> }
> xa_unlock(&sbi->managed_pslots);
> @@ -112,38 +110,35 @@ static void __erofs_workgroup_free(struct erofs_workgroup *grp)
> erofs_workgroup_free_rcu(grp);
> }
>
> -int erofs_workgroup_put(struct erofs_workgroup *grp)
> +void erofs_workgroup_put(struct erofs_workgroup *grp)
> {
> - int count = atomic_dec_return(&grp->refcount);
> + if (lockref_put_not_zero(&grp->lockref))
> + return;
May use lockref_put_or_lock() to avoid following lock?
>
> - if (count == 1)
> + spin_lock(&grp->lockref.lock);
> + DBG_BUGON(__lockref_is_dead(&grp->lockref));
> + if (grp->lockref.count == 1)
> atomic_long_inc(&erofs_global_shrink_cnt);
> - else if (!count)
> - __erofs_workgroup_free(grp);
> - return count;
> + --grp->lockref.count;
> + spin_unlock(&grp->lockref.lock);
> }
>
> static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
> struct erofs_workgroup *grp)
> {
> - /*
> - * If managed cache is on, refcount of workgroups
> - * themselves could be < 0 (freezed). In other words,
> - * there is no guarantee that all refcounts > 0.
> - */
> - if (!erofs_workgroup_try_to_freeze(grp, 1))
> - return false;
> + int free = false;
> +
> + spin_lock(&grp->lockref.lock);
> + if (grp->lockref.count)
> + goto out;
>
> /*
> - * Note that all cached pages should be unattached
> - * before deleted from the XArray. Otherwise some
> - * cached pages could be still attached to the orphan
> - * old workgroup when the new one is available in the tree.
> + * Note that all cached pages should be detached before deleted from
> + * the XArray. Otherwise some cached pages could be still attached to
> + * the orphan old workgroup when the new one is available in the tree.
> */
> - if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
> - erofs_workgroup_unfreeze(grp, 1);
> - return false;
> - }
> + if (erofs_try_to_free_all_cached_pages(sbi, grp))
> + goto out;
>
> /*
> * It's impossible to fail after the workgroup is freezed,
> @@ -152,10 +147,13 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
> */
> DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
>
> - /* last refcount should be connected with its managed pslot. */
> - erofs_workgroup_unfreeze(grp, 0);
> - __erofs_workgroup_free(grp);
> - return true;
> + lockref_mark_dead(&grp->lockref);
> + free = true;
> +out:
> + spin_unlock(&grp->lockref.lock);
> + if (free)
> + __erofs_workgroup_free(grp);
> + return free;
> }
>
> static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 15a383899540..2ea8e7f08372 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -643,7 +643,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
>
> DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
> /*
> - * refcount of workgroup is now freezed as 1,
> + * refcount of workgroup is now freezed as 0,
> * therefore no need to worry about available decompression users.
> */
> for (i = 0; i < pcl->pclusterpages; ++i) {
> @@ -676,10 +676,11 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
> if (!folio_test_private(folio))
> return true;
>
> - if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
> - return false;
> -
> ret = false;
> + spin_lock(&pcl->obj.lockref.lock);
> + if (pcl->obj.lockref.count > 0)
> + goto out;
> +
> DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
> for (i = 0; i < pcl->pclusterpages; ++i) {
> if (pcl->compressed_bvecs[i].page == &folio->page) {
> @@ -688,10 +689,10 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
> break;
> }
> }
> - erofs_workgroup_unfreeze(&pcl->obj, 1);
> -
> if (ret)
> folio_detach_private(folio);
> +out:
> + spin_unlock(&pcl->obj.lockref.lock);
> return ret;
> }
>
> @@ -807,7 +808,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
> if (IS_ERR(pcl))
> return PTR_ERR(pcl);
>
> - atomic_set(&pcl->obj.refcount, 1);
> + spin_lock_init(&pcl->obj.lockref.lock);
> pcl->algorithmformat = map->m_algorithmformat;
> pcl->length = 0;
> pcl->partial = true;
On 2023/5/29 17:16, Yue Hu wrote:
> On Mon, 29 May 2023 15:29:23 +0800
> Gao Xiang <[email protected]> wrote:
>
>> Let's avoid the current handcrafted lockref although `struct lockref`
>> inclusion usually increases extra 4 bytes with an explicit spinlock if
>> CONFIG_DEBUG_SPINLOCK is off.
>>
>> Apart from the size difference, note that the meaning of refcount is
>> also changed to active users. IOWs, it doesn't take an extra refcount
>> for XArray tree insertion.
>>
>> I don't observe any significant performance difference at least on
>> our cloud compute server but the new one indeed simplifies the
>> overall codebase a bit.
>>
>> Signed-off-by: Gao Xiang <[email protected]>
>> ---
>> changes since v1:
>> - fix reference leaking due to improper fallback of
>> erofs_workgroup_put().
>>
>> fs/erofs/internal.h | 38 ++------------------
>> fs/erofs/utils.c | 86 ++++++++++++++++++++++-----------------------
>> fs/erofs/zdata.c | 15 ++++----
>> 3 files changed, 52 insertions(+), 87 deletions(-)
>>
>> diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
>> index 0b8506c39145..e63f6cd424a0 100644
>> --- a/fs/erofs/internal.h
>> +++ b/fs/erofs/internal.h
>> @@ -208,46 +208,12 @@ enum {
>> EROFS_ZIP_CACHE_READAROUND
>> };
>>
>> -#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
>> -
>> /* basic unit of the workstation of a super_block */
>> struct erofs_workgroup {
>> - /* the workgroup index in the workstation */
>> pgoff_t index;
>> -
>> - /* overall workgroup reference count */
>> - atomic_t refcount;
>> + struct lockref lockref;
>> };
>>
>> -static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
>> - int val)
>> -{
>> - preempt_disable();
>> - if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
>> - preempt_enable();
>> - return false;
>> - }
>> - return true;
>> -}
>> -
>> -static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
>> - int orig_val)
>> -{
>> - /*
>> - * other observers should notice all modifications
>> - * in the freezing period.
>> - */
>> - smp_mb();
>> - atomic_set(&grp->refcount, orig_val);
>> - preempt_enable();
>> -}
>> -
>> -static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
>> -{
>> - return atomic_cond_read_relaxed(&grp->refcount,
>> - VAL != EROFS_LOCKED_MAGIC);
>> -}
>> -
>> enum erofs_kmap_type {
>> EROFS_NO_KMAP, /* don't map the buffer */
>> EROFS_KMAP, /* use kmap_local_page() to map the buffer */
>> @@ -492,7 +458,7 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
>> void erofs_release_pages(struct page **pagepool);
>>
>> #ifdef CONFIG_EROFS_FS_ZIP
>> -int erofs_workgroup_put(struct erofs_workgroup *grp);
>> +void erofs_workgroup_put(struct erofs_workgroup *grp);
>> struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
>> pgoff_t index);
>> struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
>> diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
>> index 46627cb69abe..6ed79f10e2e2 100644
>> --- a/fs/erofs/utils.c
>> +++ b/fs/erofs/utils.c
>> @@ -33,22 +33,21 @@ void erofs_release_pages(struct page **pagepool)
>> /* global shrink count (for all mounted EROFS instances) */
>> static atomic_long_t erofs_global_shrink_cnt;
>>
>> -static int erofs_workgroup_get(struct erofs_workgroup *grp)
>> +static bool erofs_workgroup_get(struct erofs_workgroup *grp)
>> {
>> - int o;
>> + if (lockref_get_not_zero(&grp->lockref))
>> + return true;
>>
>> -repeat:
>> - o = erofs_wait_on_workgroup_freezed(grp);
>> - if (o <= 0)
>> - return -1;
>> -
>> - if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
>> - goto repeat;
>> + spin_lock(&grp->lockref.lock);
>> + if (__lockref_is_dead(&grp->lockref)) {
>> + spin_unlock(&grp->lockref.lock);
>> + return false;
>> + }
>>
>> - /* decrease refcount paired by erofs_workgroup_put */
>> - if (o == 1)
>> + if (!grp->lockref.count++)
>> atomic_long_dec(&erofs_global_shrink_cnt);
>> - return 0;
>> + spin_unlock(&grp->lockref.lock);
>> + return true;
>> }
>
> May use lockref_get_not_dead() to simplify it a bit?
we need to get spin_lock() in the slow path and decrease
erofs_global_shrink_cnt in the lock.
>
>>
>> struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
>> @@ -61,7 +60,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
>> rcu_read_lock();
>> grp = xa_load(&sbi->managed_pslots, index);
>> if (grp) {
>> - if (erofs_workgroup_get(grp)) {
>> + if (!erofs_workgroup_get(grp)) {
>> /* prefer to relax rcu read side */
>> rcu_read_unlock();
>> goto repeat;
>> @@ -80,11 +79,10 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
>> struct erofs_workgroup *pre;
>>
>> /*
>> - * Bump up a reference count before making this visible
>> - * to others for the XArray in order to avoid potential
>> - * UAF without serialized by xa_lock.
>> + * Bump up before making this visible to others for the XArray in order
>> + * to avoid potential UAF without serialized by xa_lock.
>> */
>> - atomic_inc(&grp->refcount);
>> + lockref_get(&grp->lockref);
>>
>> repeat:
>> xa_lock(&sbi->managed_pslots);
>> @@ -93,13 +91,13 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
>> if (pre) {
>> if (xa_is_err(pre)) {
>> pre = ERR_PTR(xa_err(pre));
>> - } else if (erofs_workgroup_get(pre)) {
>> + } else if (!erofs_workgroup_get(pre)) {
>> /* try to legitimize the current in-tree one */
>> xa_unlock(&sbi->managed_pslots);
>> cond_resched();
>> goto repeat;
>> }
>> - atomic_dec(&grp->refcount);
>> + lockref_put_return(&grp->lockref);
>
> Should check return error?
nope, just dec one since it always has a refcount to decrease.
>
>> grp = pre;
>> }
>> xa_unlock(&sbi->managed_pslots);
>> @@ -112,38 +110,35 @@ static void __erofs_workgroup_free(struct erofs_workgroup *grp)
>> erofs_workgroup_free_rcu(grp);
>> }
>>
>> -int erofs_workgroup_put(struct erofs_workgroup *grp)
>> +void erofs_workgroup_put(struct erofs_workgroup *grp)
>> {
>> - int count = atomic_dec_return(&grp->refcount);
>> + if (lockref_put_not_zero(&grp->lockref))
>> + return;
>
> May use lockref_put_or_lock() to avoid following lock?
Thanks! Let me try this.
Thanks,
Gao Xiang
On Sat, 27 May 2023 04:14:53 +0800
Gao Xiang <[email protected]> wrote:
> Hi folks,
>
> These are some cleanups and fixes for the compressed part I'd like to
> aim for the next cycle. I will send several versions if there are more
> patches available. This ongoing cleanup work are also for later folio
> adaption.
>
> I've set up a stress test for this patchset at the same time.
>
> Thanks,
> Gao Xiang
>
> Gao Xiang (6):
> erofs: allocate extra bvec pages directly instead of retrying
> erofs: avoid on-stack pagepool directly passed by arguments
> erofs: kill hooked chains to avoid loops on deduplicated compressed
> images
> erofs: adapt managed inode operations into folios
> erofs: use struct lockref to replace handcrafted approach
> erofs: use poison pointer to replace the hard-coded address
>
> fs/erofs/internal.h | 41 +-------
> fs/erofs/super.c | 62 ------------
> fs/erofs/utils.c | 87 ++++++++--------
> fs/erofs/zdata.c | 238 ++++++++++++++++++++------------------------
> 4 files changed, 156 insertions(+), 272 deletions(-)
>
Reviewed-by: Yue Hu <[email protected]>
Let's avoid the current handcrafted lockref although `struct lockref`
inclusion usually increases extra 4 bytes with an explicit spinlock if
CONFIG_DEBUG_SPINLOCK is off.
Apart from the size difference, note that the meaning of refcount is
also changed to active users. IOWs, it doesn't take an extra refcount
for XArray tree insertion.
I don't observe any significant performance difference at least on
our cloud compute server but the new one indeed simplifies the
overall codebase a bit.
Signed-off-by: Gao Xiang <[email protected]>
---
changes since v2:
- use lockref_put_or_lock() as Yue's suggested;
fs/erofs/internal.h | 38 ++------------------
fs/erofs/utils.c | 85 ++++++++++++++++++++++-----------------------
fs/erofs/zdata.c | 15 ++++----
3 files changed, 51 insertions(+), 87 deletions(-)
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 005f3391bcd3..36e32fa542f0 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -208,46 +208,12 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
-
/* basic unit of the workstation of a super_block */
struct erofs_workgroup {
- /* the workgroup index in the workstation */
pgoff_t index;
-
- /* overall workgroup reference count */
- atomic_t refcount;
+ struct lockref lockref;
};
-static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
- int val)
-{
- preempt_disable();
- if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
- preempt_enable();
- return false;
- }
- return true;
-}
-
-static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
- int orig_val)
-{
- /*
- * other observers should notice all modifications
- * in the freezing period.
- */
- smp_mb();
- atomic_set(&grp->refcount, orig_val);
- preempt_enable();
-}
-
-static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
-{
- return atomic_cond_read_relaxed(&grp->refcount,
- VAL != EROFS_LOCKED_MAGIC);
-}
-
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
@@ -486,7 +452,7 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page)
void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
-int erofs_workgroup_put(struct erofs_workgroup *grp);
+void erofs_workgroup_put(struct erofs_workgroup *grp);
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
pgoff_t index);
struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 46627cb69abe..4590954cf147 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -33,22 +33,21 @@ void erofs_release_pages(struct page **pagepool)
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
-static int erofs_workgroup_get(struct erofs_workgroup *grp)
+static bool erofs_workgroup_get(struct erofs_workgroup *grp)
{
- int o;
+ if (lockref_get_not_zero(&grp->lockref))
+ return true;
-repeat:
- o = erofs_wait_on_workgroup_freezed(grp);
- if (o <= 0)
- return -1;
-
- if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o)
- goto repeat;
+ spin_lock(&grp->lockref.lock);
+ if (__lockref_is_dead(&grp->lockref)) {
+ spin_unlock(&grp->lockref.lock);
+ return false;
+ }
- /* decrease refcount paired by erofs_workgroup_put */
- if (o == 1)
+ if (!grp->lockref.count++)
atomic_long_dec(&erofs_global_shrink_cnt);
- return 0;
+ spin_unlock(&grp->lockref.lock);
+ return true;
}
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
@@ -61,7 +60,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
rcu_read_lock();
grp = xa_load(&sbi->managed_pslots, index);
if (grp) {
- if (erofs_workgroup_get(grp)) {
+ if (!erofs_workgroup_get(grp)) {
/* prefer to relax rcu read side */
rcu_read_unlock();
goto repeat;
@@ -80,11 +79,10 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
struct erofs_workgroup *pre;
/*
- * Bump up a reference count before making this visible
- * to others for the XArray in order to avoid potential
- * UAF without serialized by xa_lock.
+ * Bump up before making this visible to others for the XArray in order
+ * to avoid potential UAF without serialized by xa_lock.
*/
- atomic_inc(&grp->refcount);
+ lockref_get(&grp->lockref);
repeat:
xa_lock(&sbi->managed_pslots);
@@ -93,13 +91,13 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
if (pre) {
if (xa_is_err(pre)) {
pre = ERR_PTR(xa_err(pre));
- } else if (erofs_workgroup_get(pre)) {
+ } else if (!erofs_workgroup_get(pre)) {
/* try to legitimize the current in-tree one */
xa_unlock(&sbi->managed_pslots);
cond_resched();
goto repeat;
}
- atomic_dec(&grp->refcount);
+ lockref_put_return(&grp->lockref);
grp = pre;
}
xa_unlock(&sbi->managed_pslots);
@@ -112,38 +110,34 @@ static void __erofs_workgroup_free(struct erofs_workgroup *grp)
erofs_workgroup_free_rcu(grp);
}
-int erofs_workgroup_put(struct erofs_workgroup *grp)
+void erofs_workgroup_put(struct erofs_workgroup *grp)
{
- int count = atomic_dec_return(&grp->refcount);
+ if (lockref_put_or_lock(&grp->lockref))
+ return;
- if (count == 1)
+ DBG_BUGON(__lockref_is_dead(&grp->lockref));
+ if (grp->lockref.count == 1)
atomic_long_inc(&erofs_global_shrink_cnt);
- else if (!count)
- __erofs_workgroup_free(grp);
- return count;
+ --grp->lockref.count;
+ spin_unlock(&grp->lockref.lock);
}
static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
struct erofs_workgroup *grp)
{
- /*
- * If managed cache is on, refcount of workgroups
- * themselves could be < 0 (freezed). In other words,
- * there is no guarantee that all refcounts > 0.
- */
- if (!erofs_workgroup_try_to_freeze(grp, 1))
- return false;
+ int free = false;
+
+ spin_lock(&grp->lockref.lock);
+ if (grp->lockref.count)
+ goto out;
/*
- * Note that all cached pages should be unattached
- * before deleted from the XArray. Otherwise some
- * cached pages could be still attached to the orphan
- * old workgroup when the new one is available in the tree.
+ * Note that all cached pages should be detached before deleted from
+ * the XArray. Otherwise some cached pages could be still attached to
+ * the orphan old workgroup when the new one is available in the tree.
*/
- if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
- erofs_workgroup_unfreeze(grp, 1);
- return false;
- }
+ if (erofs_try_to_free_all_cached_pages(sbi, grp))
+ goto out;
/*
* It's impossible to fail after the workgroup is freezed,
@@ -152,10 +146,13 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
*/
DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
- /* last refcount should be connected with its managed pslot. */
- erofs_workgroup_unfreeze(grp, 0);
- __erofs_workgroup_free(grp);
- return true;
+ lockref_mark_dead(&grp->lockref);
+ free = true;
+out:
+ spin_unlock(&grp->lockref.lock);
+ if (free)
+ __erofs_workgroup_free(grp);
+ return free;
}
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index c556906354e5..637a964ff110 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -641,7 +641,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
/*
- * refcount of workgroup is now freezed as 1,
+ * refcount of workgroup is now freezed as 0,
* therefore no need to worry about available decompression users.
*/
for (i = 0; i < pcl->pclusterpages; ++i) {
@@ -674,10 +674,11 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
if (!folio_test_private(folio))
return true;
- if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
- return false;
-
ret = false;
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->obj.lockref.count > 0)
+ goto out;
+
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
for (i = 0; i < pcl->pclusterpages; ++i) {
if (pcl->compressed_bvecs[i].page == &folio->page) {
@@ -686,10 +687,10 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
break;
}
}
- erofs_workgroup_unfreeze(&pcl->obj, 1);
-
if (ret)
folio_detach_private(folio);
+out:
+ spin_unlock(&pcl->obj.lockref.lock);
return ret;
}
@@ -805,7 +806,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- atomic_set(&pcl->obj.refcount, 1);
+ spin_lock_init(&pcl->obj.lockref.lock);
pcl->algorithmformat = map->m_algorithmformat;
pcl->length = 0;
pcl->partial = true;
--
2.24.4
On Mon, 29 May 2023 20:37:27 +0800
Gao Xiang <[email protected]> wrote:
> Let's avoid the current handcrafted lockref although `struct lockref`
> inclusion usually increases extra 4 bytes with an explicit spinlock if
> CONFIG_DEBUG_SPINLOCK is off.
>
> Apart from the size difference, note that the meaning of refcount is
> also changed to active users. IOWs, it doesn't take an extra refcount
> for XArray tree insertion.
>
> I don't observe any significant performance difference at least on
> our cloud compute server but the new one indeed simplifies the
> overall codebase a bit.
>
> Signed-off-by: Gao Xiang <[email protected]>
Reviewed-by: Yue Hu <[email protected]>