2022-06-13 19:12:28

by Daeho Jeong

Subject: [PATCH] f2fs: handle decompress only post processing in softirq

From: Daeho Jeong <[email protected]>

Now decompression is handled in a workqueue, which makes read I/O
latency non-deterministic because of the non-deterministic scheduling
nature of workqueues. So, handle it in softirq context whenever
possible.

Signed-off-by: Daeho Jeong <[email protected]>
---
fs/f2fs/compress.c | 145 +++++++++++++++++++++++++--------------------
fs/f2fs/data.c | 50 ++++++++++------
fs/f2fs/f2fs.h | 10 ++--
3 files changed, 119 insertions(+), 86 deletions(-)

diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 24824cd96f36..8bc4cd0390dc 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -729,14 +729,13 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
return ret;
}

-void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_softirq)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
struct f2fs_inode_info *fi = F2FS_I(dic->inode);
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
int ret;
- int i;

trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx,
dic->cluster_size, fi->i_compress_algorithm);
@@ -746,49 +745,12 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
goto out_end_io;
}

- dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
- if (!dic->tpages) {
- ret = -ENOMEM;
- goto out_end_io;
- }
-
- for (i = 0; i < dic->cluster_size; i++) {
- if (dic->rpages[i]) {
- dic->tpages[i] = dic->rpages[i];
- continue;
- }
-
- dic->tpages[i] = f2fs_compress_alloc_page();
- if (!dic->tpages[i]) {
- ret = -ENOMEM;
- goto out_end_io;
- }
- }
-
- if (cops->init_decompress_ctx) {
- ret = cops->init_decompress_ctx(dic);
- if (ret)
- goto out_end_io;
- }
-
- dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
- if (!dic->rbuf) {
- ret = -ENOMEM;
- goto out_destroy_decompress_ctx;
- }
-
- dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
- if (!dic->cbuf) {
- ret = -ENOMEM;
- goto out_vunmap_rbuf;
- }
-
dic->clen = le32_to_cpu(dic->cbuf->clen);
dic->rlen = PAGE_SIZE << dic->log_cluster_size;

if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) {
ret = -EFSCORRUPTED;
- goto out_vunmap_cbuf;
+ goto out_end_io;
}

ret = cops->decompress_pages(dic);
@@ -809,17 +771,10 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
}
}

-out_vunmap_cbuf:
- vm_unmap_ram(dic->cbuf, dic->nr_cpages);
-out_vunmap_rbuf:
- vm_unmap_ram(dic->rbuf, dic->cluster_size);
-out_destroy_decompress_ctx:
- if (cops->destroy_decompress_ctx)
- cops->destroy_decompress_ctx(dic);
out_end_io:
trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx,
dic->clen, ret);
- f2fs_decompress_end_io(dic, ret);
+ f2fs_decompress_end_io(dic, ret, in_softirq);
}

/*
@@ -829,7 +784,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic)
* (or in the case of a failure, cleans up without actually decompressing).
*/
void f2fs_end_read_compressed_page(struct page *page, bool failed,
- block_t blkaddr)
+ block_t blkaddr, bool in_softirq)
{
struct decompress_io_ctx *dic =
(struct decompress_io_ctx *)page_private(page);
@@ -839,12 +794,12 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed,

if (failed)
WRITE_ONCE(dic->failed, true);
- else if (blkaddr)
+ else if (blkaddr && !in_softirq)
f2fs_cache_compressed_page(sbi, page,
dic->inode->i_ino, blkaddr);

if (atomic_dec_and_test(&dic->remaining_pages))
- f2fs_decompress_cluster(dic);
+ f2fs_decompress_cluster(dic, in_softirq);
}

static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index)
@@ -1552,12 +1507,14 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
return err;
}

-static void f2fs_free_dic(struct decompress_io_ctx *dic);
+static void f2fs_free_dic(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback);

struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
{
struct decompress_io_ctx *dic;
pgoff_t start_idx = start_idx_of_cluster(cc);
+ const struct f2fs_compress_ops *cops;
int i;

dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO,
@@ -1602,17 +1559,60 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->cpages[i] = page;
}

+ dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
+ if (!dic->tpages)
+ goto out_free;
+
+ for (i = 0; i < dic->cluster_size; i++) {
+ if (dic->rpages[i]) {
+ dic->tpages[i] = dic->rpages[i];
+ continue;
+ }
+
+ dic->tpages[i] = f2fs_compress_alloc_page();
+ if (!dic->tpages[i])
+ goto out_free;
+ }
+
+ dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
+ if (!dic->rbuf)
+ goto out_free;
+
+ dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages);
+ if (!dic->cbuf)
+ goto out_free;
+
+ cops = f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+ if (cops->init_decompress_ctx) {
+ int ret = cops->init_decompress_ctx(dic);
+
+ if (ret)
+ goto out_free;
+ }
+
return dic;

out_free:
- f2fs_free_dic(dic);
+ f2fs_free_dic(dic, true);
return ERR_PTR(-ENOMEM);
}

-static void f2fs_free_dic(struct decompress_io_ctx *dic)
+static void f2fs_free_dic(struct decompress_io_ctx *dic,
+ bool bypass_destroy_callback)
{
+ const struct f2fs_compress_ops *cops =
+ f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
int i;

+ if (!bypass_destroy_callback && cops->destroy_decompress_ctx)
+ cops->destroy_decompress_ctx(dic);
+
+ if (dic->cbuf)
+ vm_unmap_ram(dic->cbuf, dic->nr_cpages);
+
+ if (dic->rbuf)
+ vm_unmap_ram(dic->rbuf, dic->cluster_size);
+
if (dic->tpages) {
for (i = 0; i < dic->cluster_size; i++) {
if (dic->rpages[i])
@@ -1637,17 +1637,33 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic)
kmem_cache_free(dic_entry_slab, dic);
}

-static void f2fs_put_dic(struct decompress_io_ctx *dic)
+static void f2fs_late_free_dic(struct work_struct *work)
{
- if (refcount_dec_and_test(&dic->refcnt))
- f2fs_free_dic(dic);
+ struct decompress_io_ctx *dic =
+ container_of(work, struct decompress_io_ctx, free_work);
+
+ f2fs_free_dic(dic, false);
+}
+
+static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_softirq)
+{
+ if (refcount_dec_and_test(&dic->refcnt)) {
+ if (in_softirq) {
+ INIT_WORK(&dic->free_work, f2fs_late_free_dic);
+ queue_work(F2FS_I_SB(dic->inode)->post_read_wq,
+ &dic->free_work);
+ } else {
+ f2fs_free_dic(dic, false);
+ }
+ }
}

/*
* Update and unlock the cluster's pagecache pages, and release the reference to
* the decompress_io_ctx that was being held for I/O completion.
*/
-static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_softirq)
{
int i;

@@ -1668,7 +1684,7 @@ static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
unlock_page(rpage);
}

- f2fs_put_dic(dic);
+ f2fs_put_dic(dic, in_softirq);
}

static void f2fs_verify_cluster(struct work_struct *work)
@@ -1685,14 +1701,15 @@ static void f2fs_verify_cluster(struct work_struct *work)
SetPageError(rpage);
}

- __f2fs_decompress_end_io(dic, false);
+ __f2fs_decompress_end_io(dic, false, false);
}

/*
* This is called when a compressed cluster has been decompressed
* (or failed to be read and/or decompressed).
*/
-void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_softirq)
{
if (!failed && dic->need_verity) {
/*
@@ -1704,7 +1721,7 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
INIT_WORK(&dic->verity_work, f2fs_verify_cluster);
fsverity_enqueue_verify_work(&dic->verity_work);
} else {
- __f2fs_decompress_end_io(dic, failed);
+ __f2fs_decompress_end_io(dic, failed, in_softirq);
}
}

@@ -1713,12 +1730,12 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed)
*
* This is called when the page is no longer needed and can be freed.
*/
-void f2fs_put_page_dic(struct page *page)
+void f2fs_put_page_dic(struct page *page, bool in_softirq)
{
struct decompress_io_ctx *dic =
(struct decompress_io_ctx *)page_private(page);

- f2fs_put_dic(dic);
+ f2fs_put_dic(dic, in_softirq);
}

/*
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7fcbcf979737..87455350c9e4 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -119,7 +119,7 @@ struct bio_post_read_ctx {
block_t fs_blkaddr;
};

-static void f2fs_finish_read_bio(struct bio *bio)
+static void f2fs_finish_read_bio(struct bio *bio, bool in_softirq)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
@@ -133,8 +133,9 @@ static void f2fs_finish_read_bio(struct bio *bio)

if (f2fs_is_compressed_page(page)) {
if (bio->bi_status)
- f2fs_end_read_compressed_page(page, true, 0);
- f2fs_put_page_dic(page);
+ f2fs_end_read_compressed_page(page, true, 0,
+ in_softirq);
+ f2fs_put_page_dic(page, in_softirq);
continue;
}

@@ -191,7 +192,7 @@ static void f2fs_verify_bio(struct work_struct *work)
fsverity_verify_bio(bio);
}

- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, false);
}

/*
@@ -203,7 +204,7 @@ static void f2fs_verify_bio(struct work_struct *work)
* can involve reading verity metadata pages from the file, and these verity
* metadata pages may be encrypted and/or compressed.
*/
-static void f2fs_verify_and_finish_bio(struct bio *bio)
+static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_softirq)
{
struct bio_post_read_ctx *ctx = bio->bi_private;

@@ -211,7 +212,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio)
INIT_WORK(&ctx->work, f2fs_verify_bio);
fsverity_enqueue_verify_work(&ctx->work);
} else {
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, in_softirq);
}
}

@@ -224,7 +225,8 @@ static void f2fs_verify_and_finish_bio(struct bio *bio)
* that the bio includes at least one compressed page. The actual decompression
* is done on a per-cluster basis, not a per-bio basis.
*/
-static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
+static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
+ bool in_softirq)
{
struct bio_vec *bv;
struct bvec_iter_all iter_all;
@@ -237,7 +239,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx)
/* PG_error was set if decryption failed. */
if (f2fs_is_compressed_page(page))
f2fs_end_read_compressed_page(page, PageError(page),
- blkaddr);
+ blkaddr, in_softirq);
else
all_compressed = false;

@@ -262,9 +264,9 @@ static void f2fs_post_read_work(struct work_struct *work)
fscrypt_decrypt_bio(ctx->bio);

if (ctx->enabled_steps & STEP_DECOMPRESS)
- f2fs_handle_step_decompress(ctx);
+ f2fs_handle_step_decompress(ctx, false);

- f2fs_verify_and_finish_bio(ctx->bio);
+ f2fs_verify_and_finish_bio(ctx->bio, false);
}

static void f2fs_read_end_io(struct bio *bio)
@@ -281,16 +283,28 @@ static void f2fs_read_end_io(struct bio *bio)
}

if (bio->bi_status) {
- f2fs_finish_read_bio(bio);
+ f2fs_finish_read_bio(bio, true);
return;
}

- if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) {
- INIT_WORK(&ctx->work, f2fs_post_read_work);
- queue_work(ctx->sbi->post_read_wq, &ctx->work);
- } else {
- f2fs_verify_and_finish_bio(bio);
+ if (ctx) {
+ unsigned int enabled_steps = ctx->enabled_steps &
+ (STEP_DECRYPT | STEP_DECOMPRESS);
+
+ /*
+ * If only the decompression step is enabled (no decryption), we
+ * don't need to offload the post processing to a workqueue.
+ */
+ if (enabled_steps == STEP_DECOMPRESS) {
+ f2fs_handle_step_decompress(ctx, true);
+ } else if (enabled_steps) {
+ INIT_WORK(&ctx->work, f2fs_post_read_work);
+ queue_work(ctx->sbi->post_read_wq, &ctx->work);
+ return;
+ }
}
+
+ f2fs_verify_and_finish_bio(bio, true);
}

static void f2fs_write_end_io(struct bio *bio)
@@ -2222,7 +2236,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,

if (f2fs_load_compressed_page(sbi, page, blkaddr)) {
if (atomic_dec_and_test(&dic->remaining_pages))
- f2fs_decompress_cluster(dic);
+ f2fs_decompress_cluster(dic, false);
continue;
}

@@ -2240,7 +2254,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
page->index, for_write);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
- f2fs_decompress_end_io(dic, ret);
+ f2fs_decompress_end_io(dic, ret, false);
f2fs_put_dnode(&dn);
*bio_ret = NULL;
return ret;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d9bbecd008d2..be118d332536 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1580,6 +1580,7 @@ struct decompress_io_ctx {
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
struct work_struct verity_work; /* work to verify the decompressed pages */
+ struct work_struct free_work; /* work to free this structure itself later */
};

#define NULL_CLUSTER ((unsigned int)(~0))
@@ -4158,9 +4159,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
bool f2fs_is_compress_backend_ready(struct inode *inode);
int f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
-void f2fs_decompress_cluster(struct decompress_io_ctx *dic);
+void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_softirq);
void f2fs_end_read_compressed_page(struct page *page, bool failed,
- block_t blkaddr);
+ block_t blkaddr, bool in_softirq);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec,
@@ -4179,8 +4180,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
-void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed);
-void f2fs_put_page_dic(struct page *page);
+void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
+ bool in_softirq);
+void f2fs_put_page_dic(struct page *page, bool in_softirq);
unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
--
2.36.1.476.g0c4daa206d-goog


2022-06-14 05:50:35

by Eric Biggers

Subject: Re: [PATCH] f2fs: handle decompress only post processing in softirq

[+Cc Nathan Huckleberry who is looking into a similar problem in dm-verity]

On Mon, Jun 13, 2022 at 08:56:12AM -0700, Daeho Jeong wrote:
> From: Daeho Jeong <[email protected]>
>
> Now decompression is handled in a workqueue, which makes read I/O
> latency non-deterministic because of the non-deterministic scheduling
> nature of workqueues. So, handle it in softirq context whenever
> possible.
>
> Signed-off-by: Daeho Jeong <[email protected]>
> ---
> fs/f2fs/compress.c | 145 +++++++++++++++++++++++++--------------------
> fs/f2fs/data.c | 50 ++++++++++------
> fs/f2fs/f2fs.h | 10 ++--
> 3 files changed, 119 insertions(+), 86 deletions(-)
[...]
> static void f2fs_read_end_io(struct bio *bio)
> @@ -281,16 +283,28 @@ static void f2fs_read_end_io(struct bio *bio)
> }
>
> if (bio->bi_status) {
> - f2fs_finish_read_bio(bio);
> + f2fs_finish_read_bio(bio, true);
> return;
> }
>
> - if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) {
> - INIT_WORK(&ctx->work, f2fs_post_read_work);
> - queue_work(ctx->sbi->post_read_wq, &ctx->work);
> - } else {
> - f2fs_verify_and_finish_bio(bio);
> + if (ctx) {
> + unsigned int enabled_steps = ctx->enabled_steps &
> + (STEP_DECRYPT | STEP_DECOMPRESS);
> +
> + /*
> + * If only the decompression step is enabled (no decryption), we
> + * don't need to offload the post processing to a workqueue.
> + */
> + if (enabled_steps == STEP_DECOMPRESS) {
> + f2fs_handle_step_decompress(ctx, true);

One question: is this (the bio endio callback) actually guaranteed to be
executed from a softirq? If you look at dm-crypt's support for workqueue-less
decryption, for example, it explicitly checks 'in_hardirq() || irqs_disabled()'
and schedules a tasklet if either of those is the case.
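
A sketch of the kind of check I mean (purely illustrative; the tasklet
field is hypothetical, and f2fs_handle_step_decompress() is just
borrowed from this patch to stand in for the inline work):

        if (in_hardirq() || irqs_disabled()) {
                /* hard-IRQ context or IRQs off: punt to a tasklet */
                tasklet_schedule(&ctx->tasklet);        /* hypothetical field */
        } else {
                /* softirq or process context: do the work inline */
                f2fs_handle_step_decompress(ctx, true);
        }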

- Eric

2022-06-14 09:50:27

by Gao Xiang

Subject: Re: [f2fs-dev] [PATCH] f2fs: handle decompress only post processing in softirq

Hi all,

On Mon, Jun 13, 2022 at 10:38:25PM -0700, Eric Biggers wrote:
> [+Cc Nathan Huckleberry who is looking into a similar problem in dm-verity]
>
> On Mon, Jun 13, 2022 at 08:56:12AM -0700, Daeho Jeong wrote:
> > From: Daeho Jeong <[email protected]>
> >
> > Now decompression is handled in a workqueue, which makes read I/O
> > latency non-deterministic because of the non-deterministic scheduling
> > nature of workqueues. So, handle it in softirq context whenever
> > possible.
> >
> > Signed-off-by: Daeho Jeong <[email protected]>

...

>
> One question: is this (the bio endio callback) actually guaranteed to be
> executed from a softirq? If you look at dm-crypt's support for workqueue-less
> decryption, for example, it explicitly checks 'in_hardirq() || irqs_disabled()'
> and schedules a tasklet if either of those is the case.
>
> - Eric
>

Some of my own previous thoughts about this strategy:

- If we allocate all memory and map these before I/Os, all inflight I/Os
will keep such temporary pages all the time until decompression is
finished. In contrast, if we allocate or reuse such pages just before
decompression, it would minimize the memory footprints.

I think it will impact the memory numbers at least on the very
low-end devices with slow storage. (I've seen f2fs has some big
mempool already)

- Many compression algorithms are not suitable for softirq contexts.
Also, I vaguely remember that if a softirq context lasts for > 2ms, it
will be pushed into ksoftirqd instead, so it's actually another process
context.
And it may delay other important interrupt handling.

- Go back to the non-deterministic scheduling of workqueues. I guess it
may just be scheduling punishment because decompression consumed a lot
of CPU before, so the priority becomes low, but that is just a pure
guess. Maybe we need to use an RT scheduling policy instead.

At the very least WQ_HIGHPRI could be tried for dm-verity, but I don't
find a WQ_HIGHPRI mark for dm-verity.
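
For example, something like this (illustrative flags only, not a claim
about what f2fs or dm-verity do today; worker_task is a hypothetical
kthread):

        struct workqueue_struct *wq;

        /* unbound, high-priority workqueue for post-read work */
        wq = alloc_workqueue("post_read_wq", WQ_UNBOUND | WQ_HIGHPRI, 0);
        if (!wq)
                return -ENOMEM;

        /* or, for a dedicated kthread, bump it to an RT policy */
        sched_set_fifo(worker_task);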

Thanks,
Gao Xiang

2022-06-14 16:43:43

by Daeho Jeong

Subject: Re: [PATCH] f2fs: handle decompress only post processing in softirq

> One question: is this (the bio endio callback) actually guaranteed to be
> executed from a softirq? If you look at dm-crypt's support for workqueue-less
> decryption, for example, it explicitly checks 'in_hardirq() || irqs_disabled()'
> and schedules a tasklet if either of those is the case.
>
> - Eric

Oh, you're right. Even though it's safe to defer all the release
processing to a work item in the end_io function, it's better to check
the condition and do the release right away when possible.
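
Something like this, maybe (just a sketch on top of this patch, not
tested):

        static void f2fs_put_dic(struct decompress_io_ctx *dic)
        {
                if (refcount_dec_and_test(&dic->refcnt)) {
                        if (in_task()) {
                                /* process context: free it right away */
                                f2fs_free_dic(dic, false);
                        } else {
                                /* atomic context: defer freeing to a work */
                                INIT_WORK(&dic->free_work, f2fs_late_free_dic);
                                queue_work(F2FS_I_SB(dic->inode)->post_read_wq,
                                           &dic->free_work);
                        }
                }
        }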

Thanks,

2022-06-14 17:09:41

by Daeho Jeong

Subject: Re: [f2fs-dev] [PATCH] f2fs: handle decompress only post processing in softirq

>
> Some of my own previous thoughts about this strategy:
>
> - If we allocate all memory and map these before I/Os, all inflight I/Os
> will keep such temporary pages all the time until decompression is
> finished. In contrast, if we allocate or reuse such pages just before
> decompression, it would minimize the memory footprints.
>
> I think it will impact the memory numbers at least on the very
> low-end devices with slow storage. (I've seen f2fs has some big
> mempool already)
>
> - Many compression algorithms are not suitable for softirq contexts.
> Also, I vaguely remember that if a softirq context lasts for > 2ms, it
> will be pushed into ksoftirqd instead, so it's actually another process
> context.
> And it may delay other important interrupt handling.
>
> - Go back to the non-deterministic scheduling of workqueues. I guess it
> may just be scheduling punishment because decompression consumed a lot
> of CPU before, so the priority becomes low, but that is just a pure
> guess. Maybe we need to use an RT scheduling policy instead.
>
> At the very least WQ_HIGHPRI could be tried for dm-verity, but I don't
> find a WQ_HIGHPRI mark for dm-verity.
>
> Thanks,
> Gao Xiang

I totally understand what you are worried about. However, in the real
world, non-determinism from workqueues is harsher than we expected.
As you know, read I/Os in the system are on critical paths most of the
time, and right now the I/O latency variation with workqueues is too bad.

I also think it would be better to have something like RT scheduling
here. We could think about it more.

Thanks,

2022-06-14 17:21:43

by Gao Xiang

Subject: Re: [f2fs-dev] [PATCH] f2fs: handle decompress only post processing in softirq

Hi Daeho,

On Tue, Jun 14, 2022 at 09:46:50AM -0700, Daeho Jeong wrote:
> >
> > Some of my own previous thoughts about this strategy:
> >
> > - If we allocate all memory and map these before I/Os, all inflight I/Os
> > will keep such temporary pages all the time until decompression is
> > finished. In contrast, if we allocate or reuse such pages just before
> > decompression, it would minimize the memory footprints.
> >
> > I think it will impact the memory numbers at least on the very
> > low-end devices with slow storage. (I've seen f2fs has some big
> > mempool already)
> >
> > - Many compression algorithms are not suitable for softirq contexts.
> > Also, I vaguely remember that if a softirq context lasts for > 2ms, it
> > will be pushed into ksoftirqd instead, so it's actually another process
> > context.
> > And it may delay other important interrupt handling.
> >
> > - Go back to the non-deterministic scheduling of workqueues. I guess it
> > may just be scheduling punishment because decompression consumed a lot
> > of CPU before, so the priority becomes low, but that is just a pure
> > guess. Maybe we need to use an RT scheduling policy instead.
> >
> > At the very least WQ_HIGHPRI could be tried for dm-verity, but I don't
> > find a WQ_HIGHPRI mark for dm-verity.
> >
> > Thanks,
> > Gao Xiang
>
> I totally understand what you are worried about. However, in the real
> world, non-determinism from workqueues is harsher than we expected.
> As you know, read I/Os in the system are on critical paths most of the
> time, and right now the I/O latency variation with workqueues is too bad.
>
> I also think it would be better to have something like RT scheduling
> here. We could think about it more.

Yeah, I heard that you folks are really suffering from the scheduling
issues. But from my own previous experience, extra memory footprint is
really critical in Android low-memory scenarios (no matter low-end
devices or artificial workloads); it troubled me a lot. So I finally
introduced a lot of inplace I/O to handle/minimize that, including inplace
I/O for compressed pages and temporary pages.

But I'm not quite sure what's currently happening now, since we once
didn't have such non-deterministic workqueues, and I don't hear from
other landed vendors. I think it'd be better to analyse what's going
on for these kworkers from scheduling POV and why they don't schedule
in time.

I also have an idea, much like what I'm doing now for sync
decompression: just before lock_page() and ->read_folio(), we can
trigger some decompression in addition to the kworker decompression,
but it needs some MM modification, as below:

!PageUptodate(page)

some callback to decompress in addition to kworker

lock_page()
->read_folio()
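
A hypothetical sketch of that idea (the callback name is made up, and
reference counting and locking details are elided):

        struct page *page = find_get_page(mapping, index);

        if (page && !PageUptodate(page)) {
                /*
                 * Let the reading task lend a hand with decompression
                 * instead of only sleeping in lock_page() waiting for
                 * the kworker to finish.
                 */
                f2fs_try_decompress_cluster(page);
        }
        /* ...then fall through to the usual lock_page()/->read_folio() path */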

If mm folks don't like it, I think an RT thread is also fine once we
have analysed the root cause of the kworker delay.

Thanks,
Gao Xiang

>
> Thanks,

2022-06-14 18:03:31

by Daeho Jeong

Subject: Re: [f2fs-dev] [PATCH] f2fs: handle decompress only post processing in softirq

> Yeah, I heard that you folks are really suffering from the scheduling
> issues. But from my own previous experience, extra memory footprint is
> really critical in Android low-memory scenarios (no matter low-end
> devices or artificial workloads); it troubled me a lot. So I finally
> introduced a lot of inplace I/O to handle/minimize that, including inplace
> I/O for compressed pages and temporary pages.
>
> But I'm not quite sure what's currently happening now, since we once
> didn't have such non-deterministic workqueues, and I don't hear from
> other landed vendors. I think it'd be better to analyse what's going
> on for these kworkers from scheduling POV and why they don't schedule
> in time.
>
> I also have an idea, much like what I'm doing now for sync
> decompression: just before lock_page() and ->read_folio(), we can
> trigger some decompression in addition to the kworker decompression,
> but it needs some MM modification, as below:
>
> !PageUptodate(page)
>
> some callback to decompress in addition to kworker
>
> lock_page()
> ->read_folio()
>
> If mm folks don't like it, I think an RT thread is also fine once we
> have analysed the root cause of the kworker delay.
>
> Thanks,
> Gao Xiang
>
> >
> > Thanks,

I don't think this is a problem with most devices, since the
allocated memory is not too big and it'll be kept only as long as I/O
processing is going on. However, I still understand what you're worried
about, so I think I can add a new mount option like "memory=low",
which can be used as a hint for F2FS to prioritize using as little
memory as possible. In this mode, we will try to keep memory usage
minimal and we can use the previous implementation for decompression.
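
Roughly what I have in mind, as a fragment in the style of
parse_options() (the token name, field name, and values here are made
up and not settled):

        case Opt_memory:
                name = match_strdup(&args[0]);
                if (!name)
                        return -ENOMEM;
                if (!strcmp(name, "normal")) {
                        F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL;
                } else if (!strcmp(name, "low")) {
                        /* hint: keep memory usage minimal, allocate lazily */
                        F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_LOW;
                } else {
                        kfree(name);
                        return -EINVAL;
                }
                kfree(name);
                break;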

Thanks,

2022-06-14 18:26:43

by Gao Xiang

[permalink] [raw]
Subject: Re: [f2fs-dev] [PATCH] f2fs: handle decompress only post processing in softirq

On Tue, Jun 14, 2022 at 10:49:37AM -0700, Daeho Jeong wrote:
> > Yeah, I heard that you folks are really suffering from the scheduling
> > issues. But from my own previous experience, extra memory footprint is
> > really critical in Android low-memory scenarios (no matter low-end
> > devices or artificial workloads); it troubled me a lot. So I finally
> > introduced a lot of inplace I/O to handle/minimize that, including inplace
> > I/O for compressed pages and temporary pages.
> >
> > But I'm not quite sure what's currently happening now, since we once
> > didn't have such non-deterministic workqueues, and I don't hear from
> > other landed vendors. I think it'd be better to analyse what's going
> > on for these kworkers from scheduling POV and why they don't schedule
> > in time.
> >
> > I also have an idea, much like what I'm doing now for sync
> > decompression: just before lock_page() and ->read_folio(), we can
> > trigger some decompression in addition to the kworker decompression,
> > but it needs some MM modification, as below:
> >
> > !PageUptodate(page)
> >
> > some callback to decompress in addition to kworker
> >
> > lock_page()
> > ->read_folio()
> >
> > If mm folks don't like it, I think an RT thread is also fine once we
> > have analysed the root cause of the kworker delay.
> >
> > Thanks,
> > Gao Xiang
> >
> > >
> > > Thanks,
>
> I don't think this is a problem with most devices, since the
> allocated memory is not too big and it'll be kept only as long as I/O
> processing is going on. However, I still understand what you're worried
> about, so I think I can add a new mount option like "memory=low",
> which can be used as a hint for F2FS to prioritize using as little
> memory as possible. In this mode, we will try to keep memory usage
> minimal and we can use the previous implementation for decompression.

Okay, one of our previous tests was how many applications are still
alive after many other applications boot. That makes sense since most
users want to keep as many apps alive as possible; I know we now have a
swap-like thing, but it was once done without swap. If you reserve too
much memory (with a page mempool or even for inflight I/O), it will
impact the number of alive apps compared to the uncompressed case on
all devices (even high-end devices).

BTW, most crypto algorithms have hardware instructions to speed them
up; we actually have some in-house NEON lz4 assembly as well, but it is
still somewhat slower than the crypto algorithms, not to mention
algorithms like zstd or lzma. Anyway, I personally prefer the RT thread
way since it's more flexible; also, for dm-verity, at least try it with
WQ_HIGHPRI, and I've seen:
https://android-review.googlesource.com/c/kernel/common/+/204421

But I'm not sure why it wasn't upstreamed though.

Thanks,
Gao Xiang

>
> Thanks,