I encountered a race issue after lengthy (~594647 sec) stress tests on
a 64k-page arm64 VM with several 4k-block EROFS images. The timing
is like below:
z_erofs_try_inplace_io z_erofs_fill_bio_vec
cmpxchg(&compressed_bvecs[].page,
NULL, ..)
[access bufvec]
compressed_bvecs[] = *bvec;
Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
other fields in bufvec didn't matter. After the subpage block support
is landed, .offset and .end can be used too, but filling bufvec isn't
an atomic operation which can cause inconsistency.
Let's use a spinlock to keep the atomicity of each bufvec. More
specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
since it's rarely used (also it takes a short time if even used) as long
as the pcluster has a reference.
Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
Signed-off-by: Gao Xiang <[email protected]>
---
fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
1 file changed, 38 insertions(+), 36 deletions(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 583c062cd0e4..c1c77166b30f 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
unsigned int i;
- if (i_blocksize(fe->inode) != PAGE_SIZE)
- return;
- if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+ if (i_blocksize(fe->inode) != PAGE_SIZE ||
+ fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
for (i = 0; i < pclusterpages; ++i) {
struct page *page, *newpage;
void *t; /* mark pages just found for debugging */
- /* the compressed page was loaded before */
+ /* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
page = find_get_page(mc, pcl->obj.index + i);
-
if (page) {
t = (void *)((unsigned long)page | 1);
newpage = NULL;
@@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
t = (void *)((unsigned long)newpage | 1);
}
-
- if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
+ spin_lock(&pcl->obj.lockref.lock);
+ if (!pcl->compressed_bvecs[i].page) {
+ pcl->compressed_bvecs[i].page = t;
+ spin_unlock(&pcl->obj.lockref.lock);
continue;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
if (page)
put_page(page);
@@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
return 0;
}
-static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct z_erofs_bvec *bvec)
-{
- struct z_erofs_pcluster *const pcl = fe->pcl;
-
- while (fe->icur > 0) {
- if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
- NULL, bvec->page)) {
- pcl->compressed_bvecs[fe->icur] = *bvec;
- return true;
- }
- }
- return false;
-}
-
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
+ struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- if (z_erofs_try_inplace_io(fe, bvec))
+ spin_lock(&pcl->obj.lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->obj.lockref.lock);
return 0;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
+
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage)
@@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
{
gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
- struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
+ struct z_erofs_bvec zbv;
struct address_space *mapping;
- struct page *page, *oldpage;
+ struct page *page;
int justfound, bs = i_blocksize(f->inode);
/* Except for inplace pages, the entire page can be used for I/Os */
bvec->bv_offset = 0;
bvec->bv_len = PAGE_SIZE;
repeat:
- oldpage = READ_ONCE(zbv->page);
- if (!oldpage)
+ spin_lock(&pcl->obj.lockref.lock);
+ zbv = pcl->compressed_bvecs[nr];
+ page = zbv.page;
+ justfound = (unsigned long)page & 1UL;
+ page = (struct page *)((unsigned long)page & ~1UL);
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
+ if (!page)
goto out_allocpage;
- justfound = (unsigned long)oldpage & 1UL;
- page = (struct page *)((unsigned long)oldpage & ~1UL);
bvec->bv_page = page;
-
DBG_BUGON(z_erofs_is_shortlived_page(page));
/*
* Handle preallocated cached pages. We tried to allocate such pages
@@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
*/
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
set_page_private(page, 0);
- WRITE_ONCE(zbv->page, page);
tocache = true;
goto out_tocache;
}
@@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
* therefore it is impossible for `mapping` to be NULL.
*/
if (mapping && mapping != mc) {
- if (zbv->offset < 0)
- bvec->bv_offset = round_up(-zbv->offset, bs);
- bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
+ if (zbv.offset < 0)
+ bvec->bv_offset = round_up(-zbv.offset, bs);
+ bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
return;
}
@@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
/* the cached page is still in managed cache */
if (page->mapping == mc) {
- WRITE_ONCE(zbv->page, page);
/*
* The cached page is still available but without a valid
* `->private` pcluster hint. Let's reconnect them.
@@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
put_page(page);
out_allocpage:
page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->compressed_bvecs[nr].page) {
erofs_pagepool_add(&f->pagepool, page);
+ spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
bvec->bv_page = page;
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
@@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
if (cur + bvec.bv_len > end)
bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset))
goto submit_bio_retry;
--
2.39.3
On Thu, 25 Jan 2024 20:00:39 +0800
Gao Xiang <[email protected]> wrote:
> I encountered a race issue after lengthy (~594647 sec) stress tests on
> a 64k-page arm64 VM with several 4k-block EROFS images. The timing
> is like below:
>
> z_erofs_try_inplace_io z_erofs_fill_bio_vec
> cmpxchg(&compressed_bvecs[].page,
> NULL, ..)
> [access bufvec]
> compressed_bvecs[] = *bvec;
>
> Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
> other fields in bufvec didn't matter. After the subpage block support
> is landed, .offset and .end can be used too, but filling bufvec isn't
> an atomic operation which can cause inconsistency.
>
> Let's use a spinlock to keep the atomicity of each bufvec. More
> specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
> since it's rarely used (also it takes a short time if even used) as long
> as the pcluster has a reference.
>
> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
> Signed-off-by: Gao Xiang <[email protected]>
> ---
> fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
> 1 file changed, 38 insertions(+), 36 deletions(-)
>
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 583c062cd0e4..c1c77166b30f 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
> unsigned int i;
>
> - if (i_blocksize(fe->inode) != PAGE_SIZE)
> - return;
> - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> + if (i_blocksize(fe->inode) != PAGE_SIZE ||
> + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> return;
>
> for (i = 0; i < pclusterpages; ++i) {
> struct page *page, *newpage;
> void *t; /* mark pages just found for debugging */
>
> - /* the compressed page was loaded before */
> + /* Inaccurate check w/o locking to avoid unneeded lookups */
> if (READ_ONCE(pcl->compressed_bvecs[i].page))
> continue;
>
> page = find_get_page(mc, pcl->obj.index + i);
> -
> if (page) {
> t = (void *)((unsigned long)page | 1);
> newpage = NULL;
> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
> t = (void *)((unsigned long)newpage | 1);
> }
> -
> - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
> + spin_lock(&pcl->obj.lockref.lock);
> + if (!pcl->compressed_bvecs[i].page) {
> + pcl->compressed_bvecs[i].page = t;
> + spin_unlock(&pcl->obj.lockref.lock);
> continue;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
>
> if (page)
> put_page(page);
> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
> return 0;
> }
>
> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
> - struct z_erofs_bvec *bvec)
> -{
> - struct z_erofs_pcluster *const pcl = fe->pcl;
> -
> - while (fe->icur > 0) {
> - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
> - NULL, bvec->page)) {
> - pcl->compressed_bvecs[fe->icur] = *bvec;
> - return true;
> - }
> - }
> - return false;
> -}
> -
> /* callers must be with pcluster lock held */
> static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
> struct z_erofs_bvec *bvec, bool exclusive)
> {
> + struct z_erofs_pcluster *pcl = fe->pcl;
> int ret;
>
> if (exclusive) {
> /* give priority for inplaceio to use file pages first */
> - if (z_erofs_try_inplace_io(fe, bvec))
> + spin_lock(&pcl->obj.lockref.lock);
> + while (fe->icur > 0) {
> + if (pcl->compressed_bvecs[--fe->icur].page)
> + continue;
> + pcl->compressed_bvecs[fe->icur] = *bvec;
> + spin_unlock(&pcl->obj.lockref.lock);
> return 0;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
> +
> /* otherwise, check if it can be used as a bvpage */
> if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
> !fe->candidate_bvpage)
> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> {
> gfp_t gfp = mapping_gfp_mask(mc);
> bool tocache = false;
> - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
> + struct z_erofs_bvec zbv;
> struct address_space *mapping;
> - struct page *page, *oldpage;
> + struct page *page;
> int justfound, bs = i_blocksize(f->inode);
>
> /* Except for inplace pages, the entire page can be used for I/Os */
> bvec->bv_offset = 0;
> bvec->bv_len = PAGE_SIZE;
> repeat:
> - oldpage = READ_ONCE(zbv->page);
> - if (!oldpage)
> + spin_lock(&pcl->obj.lockref.lock);
> + zbv = pcl->compressed_bvecs[nr];
> + page = zbv.page;
> + justfound = (unsigned long)page & 1UL;
> + page = (struct page *)((unsigned long)page & ~1UL);
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> + if (!page)
> goto out_allocpage;
>
> - justfound = (unsigned long)oldpage & 1UL;
> - page = (struct page *)((unsigned long)oldpage & ~1UL);
> bvec->bv_page = page;
> -
> DBG_BUGON(z_erofs_is_shortlived_page(page));
> /*
> * Handle preallocated cached pages. We tried to allocate such pages
> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> */
> if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
> set_page_private(page, 0);
> - WRITE_ONCE(zbv->page, page);
> tocache = true;
> goto out_tocache;
> }
> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> * therefore it is impossible for `mapping` to be NULL.
> */
> if (mapping && mapping != mc) {
> - if (zbv->offset < 0)
> - bvec->bv_offset = round_up(-zbv->offset, bs);
> - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
> + if (zbv.offset < 0)
> + bvec->bv_offset = round_up(-zbv.offset, bs);
> + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
> return;
> }
>
> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>
> /* the cached page is still in managed cache */
> if (page->mapping == mc) {
> - WRITE_ONCE(zbv->page, page);
> /*
> * The cached page is still available but without a valid
> * `->private` pcluster hint. Let's reconnect them.
> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> put_page(page);
> out_allocpage:
> page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
> - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
> + spin_lock(&pcl->obj.lockref.lock);
> + if (pcl->compressed_bvecs[nr].page) {
> erofs_pagepool_add(&f->pagepool, page);
> + spin_unlock(&pcl->obj.lockref.lock);
> cond_resched();
> goto repeat;
> }
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> bvec->bv_page = page;
> out_tocache:
> if (!tocache || bs != PAGE_SIZE ||
> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>
> if (cur + bvec.bv_len > end)
> bvec.bv_len = end - cur;
> + DBG_BUGON(bvec.bv_len < sb->s_blocksize);
> if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
> bvec.bv_offset))
> goto submit_bio_retry;
Looks good to me.
Reviewed-by: Yue Hu <[email protected]>
On Thu, Jan 25, 2024 at 4:01 AM Gao Xiang <[email protected]> wrote:
>
> I encountered a race issue after lengthy (~594647 sec) stress tests on
> a 64k-page arm64 VM with several 4k-block EROFS images. The timing
> is like below:
>
> z_erofs_try_inplace_io z_erofs_fill_bio_vec
> cmpxchg(&compressed_bvecs[].page,
> NULL, ..)
> [access bufvec]
> compressed_bvecs[] = *bvec;
>
> Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
> other fields in bufvec didn't matter. After the subpage block support
> is landed, .offset and .end can be used too, but filling bufvec isn't
> an atomic operation which can cause inconsistency.
>
> Let's use a spinlock to keep the atomicity of each bufvec. More
> specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
> since it's rarely used (also it takes a short time if even used) as long
> as the pcluster has a reference.
>
> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
> Signed-off-by: Gao Xiang <[email protected]>
> ---
> fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
> 1 file changed, 38 insertions(+), 36 deletions(-)
>
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 583c062cd0e4..c1c77166b30f 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
> unsigned int i;
>
> - if (i_blocksize(fe->inode) != PAGE_SIZE)
> - return;
> - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> + if (i_blocksize(fe->inode) != PAGE_SIZE ||
> + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> return;
>
> for (i = 0; i < pclusterpages; ++i) {
> struct page *page, *newpage;
> void *t; /* mark pages just found for debugging */
>
> - /* the compressed page was loaded before */
> + /* Inaccurate check w/o locking to avoid unneeded lookups */
> if (READ_ONCE(pcl->compressed_bvecs[i].page))
> continue;
>
> page = find_get_page(mc, pcl->obj.index + i);
> -
> if (page) {
> t = (void *)((unsigned long)page | 1);
> newpage = NULL;
> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
> set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
> t = (void *)((unsigned long)newpage | 1);
> }
> -
> - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
> + spin_lock(&pcl->obj.lockref.lock);
> + if (!pcl->compressed_bvecs[i].page) {
> + pcl->compressed_bvecs[i].page = t;
> + spin_unlock(&pcl->obj.lockref.lock);
> continue;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
>
> if (page)
> put_page(page);
> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
> return 0;
> }
>
> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
> - struct z_erofs_bvec *bvec)
> -{
> - struct z_erofs_pcluster *const pcl = fe->pcl;
> -
> - while (fe->icur > 0) {
> - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
> - NULL, bvec->page)) {
> - pcl->compressed_bvecs[fe->icur] = *bvec;
> - return true;
> - }
> - }
> - return false;
> -}
> -
> /* callers must be with pcluster lock held */
> static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
> struct z_erofs_bvec *bvec, bool exclusive)
> {
> + struct z_erofs_pcluster *pcl = fe->pcl;
> int ret;
>
> if (exclusive) {
> /* give priority for inplaceio to use file pages first */
> - if (z_erofs_try_inplace_io(fe, bvec))
> + spin_lock(&pcl->obj.lockref.lock);
> + while (fe->icur > 0) {
> + if (pcl->compressed_bvecs[--fe->icur].page)
> + continue;
> + pcl->compressed_bvecs[fe->icur] = *bvec;
> + spin_unlock(&pcl->obj.lockref.lock);
> return 0;
> + }
> + spin_unlock(&pcl->obj.lockref.lock);
> +
> /* otherwise, check if it can be used as a bvpage */
> if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
> !fe->candidate_bvpage)
> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> {
> gfp_t gfp = mapping_gfp_mask(mc);
> bool tocache = false;
> - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
> + struct z_erofs_bvec zbv;
> struct address_space *mapping;
> - struct page *page, *oldpage;
> + struct page *page;
> int justfound, bs = i_blocksize(f->inode);
>
> /* Except for inplace pages, the entire page can be used for I/Os */
> bvec->bv_offset = 0;
> bvec->bv_len = PAGE_SIZE;
> repeat:
> - oldpage = READ_ONCE(zbv->page);
> - if (!oldpage)
> + spin_lock(&pcl->obj.lockref.lock);
> + zbv = pcl->compressed_bvecs[nr];
> + page = zbv.page;
> + justfound = (unsigned long)page & 1UL;
> + page = (struct page *)((unsigned long)page & ~1UL);
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> + if (!page)
> goto out_allocpage;
>
> - justfound = (unsigned long)oldpage & 1UL;
> - page = (struct page *)((unsigned long)oldpage & ~1UL);
> bvec->bv_page = page;
> -
> DBG_BUGON(z_erofs_is_shortlived_page(page));
> /*
> * Handle preallocated cached pages. We tried to allocate such pages
> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> */
> if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
> set_page_private(page, 0);
> - WRITE_ONCE(zbv->page, page);
> tocache = true;
> goto out_tocache;
> }
> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> * therefore it is impossible for `mapping` to be NULL.
> */
> if (mapping && mapping != mc) {
> - if (zbv->offset < 0)
> - bvec->bv_offset = round_up(-zbv->offset, bs);
> - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
> + if (zbv.offset < 0)
> + bvec->bv_offset = round_up(-zbv.offset, bs);
> + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
> return;
> }
>
> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>
> /* the cached page is still in managed cache */
> if (page->mapping == mc) {
> - WRITE_ONCE(zbv->page, page);
> /*
> * The cached page is still available but without a valid
> * `->private` pcluster hint. Let's reconnect them.
> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
> put_page(page);
> out_allocpage:
> page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
> - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
> + spin_lock(&pcl->obj.lockref.lock);
> + if (pcl->compressed_bvecs[nr].page) {
> erofs_pagepool_add(&f->pagepool, page);
> + spin_unlock(&pcl->obj.lockref.lock);
> cond_resched();
> goto repeat;
> }
> + pcl->compressed_bvecs[nr].page = page;
> + spin_unlock(&pcl->obj.lockref.lock);
> bvec->bv_page = page;
> out_tocache:
> if (!tocache || bs != PAGE_SIZE ||
> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>
> if (cur + bvec.bv_len > end)
> bvec.bv_len = end - cur;
> + DBG_BUGON(bvec.bv_len < sb->s_blocksize);
> if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
> bvec.bv_offset))
> goto submit_bio_retry;
> --
> 2.39.3
>
LGTM!
Reviewed-by: Sandeep Dhavale <[email protected]>
On 2024/1/26 12:56, Sandeep Dhavale via Linux-erofs wrote:
> On Thu, Jan 25, 2024 at 4:01 AM Gao Xiang <[email protected]> wrote:
>>
>> I encountered a race issue after lengthy (~594647 sec) stress tests on
>> a 64k-page arm64 VM with several 4k-block EROFS images. The timing
>> is like below:
>>
>> z_erofs_try_inplace_io z_erofs_fill_bio_vec
>> cmpxchg(&compressed_bvecs[].page,
>> NULL, ..)
>> [access bufvec]
>> compressed_bvecs[] = *bvec;
>>
>> Previously, z_erofs_submit_queue() just accessed bufvec->page only, so
>> other fields in bufvec didn't matter. After the subpage block support
>> is landed, .offset and .end can be used too, but filling bufvec isn't
>> an atomic operation which can cause inconsistency.
>>
>> Let's use a spinlock to keep the atomicity of each bufvec. More
>> specifically, just reuse the existing spinlock `pcl->obj.lockref.lock`
>> since it's rarely used (also it takes a short time if even used) as long
>> as the pcluster has a reference.
>>
>> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
>> Signed-off-by: Gao Xiang <[email protected]>
>> ---
>> fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
>> 1 file changed, 38 insertions(+), 36 deletions(-)
>>
>> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
>> index 583c062cd0e4..c1c77166b30f 100644
>> --- a/fs/erofs/zdata.c
>> +++ b/fs/erofs/zdata.c
>> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>> __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
>> unsigned int i;
>>
>> - if (i_blocksize(fe->inode) != PAGE_SIZE)
>> - return;
>> - if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
>> + if (i_blocksize(fe->inode) != PAGE_SIZE ||
>> + fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
>> return;
>>
>> for (i = 0; i < pclusterpages; ++i) {
>> struct page *page, *newpage;
>> void *t; /* mark pages just found for debugging */
>>
>> - /* the compressed page was loaded before */
>> + /* Inaccurate check w/o locking to avoid unneeded lookups */
>> if (READ_ONCE(pcl->compressed_bvecs[i].page))
>> continue;
>>
>> page = find_get_page(mc, pcl->obj.index + i);
>> -
>> if (page) {
>> t = (void *)((unsigned long)page | 1);
>> newpage = NULL;
>> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>> set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
>> t = (void *)((unsigned long)newpage | 1);
>> }
>> -
>> - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
>> + spin_lock(&pcl->obj.lockref.lock);
>> + if (!pcl->compressed_bvecs[i].page) {
>> + pcl->compressed_bvecs[i].page = t;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> continue;
>> + }
>> + spin_unlock(&pcl->obj.lockref.lock);
>>
>> if (page)
>> put_page(page);
>> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
>> return 0;
>> }
>>
>> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
>> - struct z_erofs_bvec *bvec)
>> -{
>> - struct z_erofs_pcluster *const pcl = fe->pcl;
>> -
>> - while (fe->icur > 0) {
>> - if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
>> - NULL, bvec->page)) {
>> - pcl->compressed_bvecs[fe->icur] = *bvec;
>> - return true;
>> - }
>> - }
>> - return false;
>> -}
>> -
>> /* callers must be with pcluster lock held */
>> static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
>> struct z_erofs_bvec *bvec, bool exclusive)
>> {
>> + struct z_erofs_pcluster *pcl = fe->pcl;
>> int ret;
>>
>> if (exclusive) {
>> /* give priority for inplaceio to use file pages first */
>> - if (z_erofs_try_inplace_io(fe, bvec))
>> + spin_lock(&pcl->obj.lockref.lock);
>> + while (fe->icur > 0) {
>> + if (pcl->compressed_bvecs[--fe->icur].page)
>> + continue;
>> + pcl->compressed_bvecs[fe->icur] = *bvec;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> return 0;
>> + }
>> + spin_unlock(&pcl->obj.lockref.lock);
>> +
>> /* otherwise, check if it can be used as a bvpage */
>> if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
>> !fe->candidate_bvpage)
>> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> {
>> gfp_t gfp = mapping_gfp_mask(mc);
>> bool tocache = false;
>> - struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
>> + struct z_erofs_bvec zbv;
>> struct address_space *mapping;
>> - struct page *page, *oldpage;
>> + struct page *page;
>> int justfound, bs = i_blocksize(f->inode);
>>
>> /* Except for inplace pages, the entire page can be used for I/Os */
>> bvec->bv_offset = 0;
>> bvec->bv_len = PAGE_SIZE;
>> repeat:
>> - oldpage = READ_ONCE(zbv->page);
>> - if (!oldpage)
>> + spin_lock(&pcl->obj.lockref.lock);
>> + zbv = pcl->compressed_bvecs[nr];
>> + page = zbv.page;
>> + justfound = (unsigned long)page & 1UL;
>> + page = (struct page *)((unsigned long)page & ~1UL);
>> + pcl->compressed_bvecs[nr].page = page;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> + if (!page)
>> goto out_allocpage;
>>
>> - justfound = (unsigned long)oldpage & 1UL;
>> - page = (struct page *)((unsigned long)oldpage & ~1UL);
>> bvec->bv_page = page;
>> -
>> DBG_BUGON(z_erofs_is_shortlived_page(page));
>> /*
>> * Handle preallocated cached pages. We tried to allocate such pages
>> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> */
>> if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
>> set_page_private(page, 0);
>> - WRITE_ONCE(zbv->page, page);
>> tocache = true;
>> goto out_tocache;
>> }
>> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> * therefore it is impossible for `mapping` to be NULL.
>> */
>> if (mapping && mapping != mc) {
>> - if (zbv->offset < 0)
>> - bvec->bv_offset = round_up(-zbv->offset, bs);
>> - bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
>> + if (zbv.offset < 0)
>> + bvec->bv_offset = round_up(-zbv.offset, bs);
>> + bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
>> return;
>> }
>>
>> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>>
>> /* the cached page is still in managed cache */
>> if (page->mapping == mc) {
>> - WRITE_ONCE(zbv->page, page);
>> /*
>> * The cached page is still available but without a valid
>> * `->private` pcluster hint. Let's reconnect them.
>> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>> put_page(page);
>> out_allocpage:
>> page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
>> - if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
>> + spin_lock(&pcl->obj.lockref.lock);
>> + if (pcl->compressed_bvecs[nr].page) {
>> erofs_pagepool_add(&f->pagepool, page);
>> + spin_unlock(&pcl->obj.lockref.lock);
>> cond_resched();
>> goto repeat;
>> }
>> + pcl->compressed_bvecs[nr].page = page;
>> + spin_unlock(&pcl->obj.lockref.lock);
>> bvec->bv_page = page;
>> out_tocache:
>> if (!tocache || bs != PAGE_SIZE ||
>> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>>
>> if (cur + bvec.bv_len > end)
>> bvec.bv_len = end - cur;
>> + DBG_BUGON(bvec.bv_len < sb->s_blocksize);
>> if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
>> bvec.bv_offset))
>> goto submit_bio_retry;
>> --
>> 2.39.3
>>
>
> LGTM!
>
> Reviewed-by: Sandeep Dhavale <[email protected]>
Thanks for the review :-)
Thanks,
Gao Xiang