Using per-cpu thread pool we can reduce the scheduling latency compared
to workqueue implementation. With this patch scheduling latency and
variation is reduced as per-cpu threads are SCHED_FIFO kthread_workers.
The results were evaluated on arm64 Android devices running 5.10 kernel.
The table below shows resulting improvements of total scheduling latency
for the same app launch benchmark runs with 50 iterations. Scheduling
latency is the latency between when the task (workqueue kworker vs
kthread_worker) became eligible to run to when it actually started
running.
+-------------------------+-----------+----------------+---------+
| | workqueue | kthread_worker | diff |
+-------------------------+-----------+----------------+---------+
| Average (us) | 15253 | 2914 | -80.89% |
| Median (us) | 14001 | 2912 | -79.20% |
| Minimum (us) | 3117 | 1027 | -67.05% |
| Maximum (us) | 30170 | 3805 | -87.39% |
| Standard deviation (us) | 7166 | 359 | |
+-------------------------+-----------+----------------+---------+
Signed-off-by: Sandeep Dhavale <[email protected]>
---
fs/erofs/zdata.c | 84 +++++++++++++++++++++++++++++++++++-------------
fs/erofs/zdata.h | 4 ++-
2 files changed, 64 insertions(+), 24 deletions(-)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index ccf7c55d477f..646667dbe615 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -8,6 +8,7 @@
#include "compress.h"
#include <linux/prefetch.h>
#include <linux/psi.h>
+#include <linux/slab.h>
#include <trace/events/erofs.h>
@@ -184,26 +185,56 @@ typedef tagptr1_t compressed_page_t;
#define tag_compressed_page_justfound(page) \
tagptr_fold(compressed_page_t, page, 1)
-static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kthread_worker **z_erofs_kthread_pool;
-void z_erofs_exit_zip_subsystem(void)
+static void z_erofs_destroy_kthread_pool(void)
{
- destroy_workqueue(z_erofs_workqueue);
- z_erofs_destroy_pcluster_pool();
+ unsigned long cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (z_erofs_kthread_pool[cpu]) {
+ kthread_destroy_worker(z_erofs_kthread_pool[cpu]);
+ z_erofs_kthread_pool[cpu] = NULL;
+ }
+ }
+ kfree(z_erofs_kthread_pool);
}
-static inline int z_erofs_init_workqueue(void)
+static int z_erofs_create_kthread_workers(void)
{
- const unsigned int onlinecpus = num_possible_cpus();
+ unsigned long cpu;
+ struct kthread_worker *worker;
+
+ for_each_possible_cpu(cpu) {
+ worker = kthread_create_worker_on_cpu(cpu, 0, "z_erofs/%ld", cpu);
+ if (IS_ERR(worker)) {
+ z_erofs_destroy_kthread_pool();
+ return -ENOMEM;
+ }
+ sched_set_fifo(worker->task);
+ z_erofs_kthread_pool[cpu] = worker;
+ }
+ return 0;
+}
- /*
- * no need to spawn too many threads, limiting threads could minimum
- * scheduling overhead, perhaps per-CPU threads should be better?
- */
- z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
- WQ_UNBOUND | WQ_HIGHPRI,
- onlinecpus + onlinecpus / 4);
- return z_erofs_workqueue ? 0 : -ENOMEM;
+static int z_erofs_init_kthread_pool(void)
+{
+ int err;
+
+ z_erofs_kthread_pool = kcalloc(num_possible_cpus(),
+ sizeof(struct kthread_worker *), GFP_ATOMIC);
+ if (!z_erofs_kthread_pool)
+ return -ENOMEM;
+ err = z_erofs_create_kthread_workers();
+
+ return err;
+}
+
+
+void z_erofs_exit_zip_subsystem(void)
+{
+ z_erofs_destroy_kthread_pool();
+ z_erofs_destroy_pcluster_pool();
}
int __init z_erofs_init_zip_subsystem(void)
@@ -211,10 +242,16 @@ int __init z_erofs_init_zip_subsystem(void)
int err = z_erofs_create_pcluster_pool();
if (err)
- return err;
- err = z_erofs_init_workqueue();
+ goto out_error_pcluster_pool;
+
+ err = z_erofs_init_kthread_pool();
if (err)
- z_erofs_destroy_pcluster_pool();
+ goto out_error_kthread_pool;
+
+ return err;
+out_error_kthread_pool:
+ z_erofs_destroy_pcluster_pool();
+out_error_pcluster_pool:
return err;
}
@@ -1143,7 +1180,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
}
}
-static void z_erofs_decompressqueue_work(struct work_struct *work)
+static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
{
struct z_erofs_decompressqueue *bgq =
container_of(work, struct z_erofs_decompressqueue, u.work);
@@ -1170,15 +1207,16 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
if (atomic_add_return(bios, &io->pending_bios))
return;
- /* Use workqueue and sync decompression for atomic contexts only */
+ /* Use kthread_workers and sync decompression for atomic contexts only */
if (in_atomic() || irqs_disabled()) {
- queue_work(z_erofs_workqueue, &io->u.work);
+ kthread_queue_work(z_erofs_kthread_pool[raw_smp_processor_id()],
+ &io->u.work);
/* enable sync decompression for readahead */
if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
return;
}
- z_erofs_decompressqueue_work(&io->u.work);
+ z_erofs_decompressqueue_kthread_work(&io->u.work);
}
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
@@ -1306,7 +1344,7 @@ jobqueue_init(struct super_block *sb,
*fg = true;
goto fg_out;
}
- INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
+ kthread_init_work(&q->u.work, z_erofs_decompressqueue_kthread_work);
} else {
fg_out:
q = fgq;
@@ -1500,7 +1538,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
/*
* although background is preferred, no one is pending for submission.
- * don't issue workqueue for decompression but drop it directly instead.
+ * don't issue kthread_work for decompression but drop it directly instead.
*/
if (!*force_fg && !nr_bios) {
kvfree(q[JQ_SUBMIT]);
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index d98c95212985..808bbbf71b7b 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -6,6 +6,8 @@
#ifndef __EROFS_FS_ZDATA_H
#define __EROFS_FS_ZDATA_H
+#include <linux/kthread.h>
+
#include "internal.h"
#include "tagptr.h"
@@ -107,7 +109,7 @@ struct z_erofs_decompressqueue {
union {
struct completion done;
- struct work_struct work;
+ struct kthread_work work;
} u;
bool eio;
--
2.39.0.314.g84b9a713c41-goog
Hi Sandeen,
On Thu, Dec 22, 2022 at 01:15:29AM +0000, Sandeep Dhavale wrote:
> Using per-cpu thread pool we can reduce the scheduling latency compared
> to workqueue implementation. With this patch scheduling latency and
> variation is reduced as per-cpu threads are SCHED_FIFO kthread_workers.
>
> The results were evaluated on arm64 Android devices running 5.10 kernel.
>
> The table below shows resulting improvements of total scheduling latency
> for the same app launch benchmark runs with 50 iterations. Scheduling
> latency is the latency between when the task (workqueue kworker vs
> kthread_worker) became eligible to run to when it actually started
> running.
> +-------------------------+-----------+----------------+---------+
> | | workqueue | kthread_worker | diff |
> +-------------------------+-----------+----------------+---------+
> | Average (us) | 15253 | 2914 | -80.89% |
> | Median (us) | 14001 | 2912 | -79.20% |
> | Minimum (us) | 3117 | 1027 | -67.05% |
> | Maximum (us) | 30170 | 3805 | -87.39% |
> | Standard deviation (us) | 7166 | 359 | |
> +-------------------------+-----------+----------------+---------+
>
> Signed-off-by: Sandeep Dhavale <[email protected]>
Thanks for the patch. Generally, This path looks good to me (compared
with softirq context.)
With the background at LPC 22, I can see how important such low latency
requirement is needed for Android upstream and AOSP. However, could you
add some link or some brief background to other folks without such
impression?
> ---
> fs/erofs/zdata.c | 84 +++++++++++++++++++++++++++++++++++-------------
> fs/erofs/zdata.h | 4 ++-
> 2 files changed, 64 insertions(+), 24 deletions(-)
>
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index ccf7c55d477f..646667dbe615 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -8,6 +8,7 @@
> #include "compress.h"
> #include <linux/prefetch.h>
> #include <linux/psi.h>
> +#include <linux/slab.h>
>
> #include <trace/events/erofs.h>
>
> @@ -184,26 +185,56 @@ typedef tagptr1_t compressed_page_t;
> #define tag_compressed_page_justfound(page) \
> tagptr_fold(compressed_page_t, page, 1)
>
> -static struct workqueue_struct *z_erofs_workqueue __read_mostly;
> +static struct kthread_worker **z_erofs_kthread_pool;
>
> -void z_erofs_exit_zip_subsystem(void)
> +static void z_erofs_destroy_kthread_pool(void)
> {
> - destroy_workqueue(z_erofs_workqueue);
> - z_erofs_destroy_pcluster_pool();
> + unsigned long cpu;
> +
> + for_each_possible_cpu(cpu) {
> + if (z_erofs_kthread_pool[cpu]) {
> + kthread_destroy_worker(z_erofs_kthread_pool[cpu]);
> + z_erofs_kthread_pool[cpu] = NULL;
> + }
> + }
> + kfree(z_erofs_kthread_pool);
> }
>
> -static inline int z_erofs_init_workqueue(void)
> +static int z_erofs_create_kthread_workers(void)
> {
> - const unsigned int onlinecpus = num_possible_cpus();
> + unsigned long cpu;
> + struct kthread_worker *worker;
> +
> + for_each_possible_cpu(cpu) {
> + worker = kthread_create_worker_on_cpu(cpu, 0, "z_erofs/%ld", cpu);
how about calling them as erofs_worker/%ld, since in the future they
can also be used for other uses (like verification or decryption).
> + if (IS_ERR(worker)) {
> + z_erofs_destroy_kthread_pool();
> + return -ENOMEM;
> + }
> + sched_set_fifo(worker->task);
Could we add some kernel configuration option to enable/disable this,
since I'm not sure if all users need RT threads.
> + z_erofs_kthread_pool[cpu] = worker;
> + }
> + return 0;
> +}
>
> - /*
> - * no need to spawn too many threads, limiting threads could minimum
> - * scheduling overhead, perhaps per-CPU threads should be better?
> - */
> - z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
> - WQ_UNBOUND | WQ_HIGHPRI,
> - onlinecpus + onlinecpus / 4);
> - return z_erofs_workqueue ? 0 : -ENOMEM;
> +static int z_erofs_init_kthread_pool(void)
> +{
> + int err;
> +
> + z_erofs_kthread_pool = kcalloc(num_possible_cpus(),
> + sizeof(struct kthread_worker *), GFP_ATOMIC);
> + if (!z_erofs_kthread_pool)
> + return -ENOMEM;
> + err = z_erofs_create_kthread_workers();
> +
> + return err;
> +}
> +
> +
> +void z_erofs_exit_zip_subsystem(void)
> +{
> + z_erofs_destroy_kthread_pool();
> + z_erofs_destroy_pcluster_pool();
> }
>
> int __init z_erofs_init_zip_subsystem(void)
> @@ -211,10 +242,16 @@ int __init z_erofs_init_zip_subsystem(void)
> int err = z_erofs_create_pcluster_pool();
>
> if (err)
> - return err;
> - err = z_erofs_init_workqueue();
> + goto out_error_pcluster_pool;
> +
> + err = z_erofs_init_kthread_pool();
> if (err)
> - z_erofs_destroy_pcluster_pool();
> + goto out_error_kthread_pool;
> +
> + return err;
> +out_error_kthread_pool:
> + z_erofs_destroy_pcluster_pool();
> +out_error_pcluster_pool:
> return err;
> }
>
> @@ -1143,7 +1180,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
> }
> }
>
> -static void z_erofs_decompressqueue_work(struct work_struct *work)
> +static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
> {
> struct z_erofs_decompressqueue *bgq =
> container_of(work, struct z_erofs_decompressqueue, u.work);
> @@ -1170,15 +1207,16 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
>
> if (atomic_add_return(bios, &io->pending_bios))
> return;
> - /* Use workqueue and sync decompression for atomic contexts only */
> + /* Use kthread_workers and sync decompression for atomic contexts only */
> if (in_atomic() || irqs_disabled()) {
> - queue_work(z_erofs_workqueue, &io->u.work);
> + kthread_queue_work(z_erofs_kthread_pool[raw_smp_processor_id()],
> + &io->u.work);
Should we need to handle cpu online/offline as well?
Thanks,
Gao Xiang
On Wed, Dec 21, 2022 at 6:48 PM Gao Xiang <[email protected]> wrote:
>
> Hi Sandeen,
>
> On Thu, Dec 22, 2022 at 01:15:29AM +0000, Sandeep Dhavale wrote:
> > Using per-cpu thread pool we can reduce the scheduling latency compared
> > to workqueue implementation. With this patch scheduling latency and
> > variation is reduced as per-cpu threads are SCHED_FIFO kthread_workers.
> >
> > The results were evaluated on arm64 Android devices running 5.10 kernel.
> >
> > The table below shows resulting improvements of total scheduling latency
> > for the same app launch benchmark runs with 50 iterations. Scheduling
> > latency is the latency between when the task (workqueue kworker vs
> > kthread_worker) became eligible to run to when it actually started
> > running.
> > +-------------------------+-----------+----------------+---------+
> > | | workqueue | kthread_worker | diff |
> > +-------------------------+-----------+----------------+---------+
> > | Average (us) | 15253 | 2914 | -80.89% |
> > | Median (us) | 14001 | 2912 | -79.20% |
> > | Minimum (us) | 3117 | 1027 | -67.05% |
> > | Maximum (us) | 30170 | 3805 | -87.39% |
> > | Standard deviation (us) | 7166 | 359 | |
> > +-------------------------+-----------+----------------+---------+
> >
> > Signed-off-by: Sandeep Dhavale <[email protected]>
>
> Thanks for the patch. Generally, This path looks good to me (compared
> with softirq context.)
>
> With the background at LPC 22, I can see how important such low latency
> requirement is needed for Android upstream and AOSP. However, could you
> add some link or some brief background to other folks without such
> impression?
>
> > ---
> > fs/erofs/zdata.c | 84 +++++++++++++++++++++++++++++++++++-------------
> > fs/erofs/zdata.h | 4 ++-
> > 2 files changed, 64 insertions(+), 24 deletions(-)
> >
> > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> > index ccf7c55d477f..646667dbe615 100644
> > --- a/fs/erofs/zdata.c
> > +++ b/fs/erofs/zdata.c
> > @@ -8,6 +8,7 @@
> > #include "compress.h"
> > #include <linux/prefetch.h>
> > #include <linux/psi.h>
> > +#include <linux/slab.h>
> >
> > #include <trace/events/erofs.h>
> >
> > @@ -184,26 +185,56 @@ typedef tagptr1_t compressed_page_t;
> > #define tag_compressed_page_justfound(page) \
> > tagptr_fold(compressed_page_t, page, 1)
> >
> > -static struct workqueue_struct *z_erofs_workqueue __read_mostly;
> > +static struct kthread_worker **z_erofs_kthread_pool;
> >
> > -void z_erofs_exit_zip_subsystem(void)
> > +static void z_erofs_destroy_kthread_pool(void)
> > {
> > - destroy_workqueue(z_erofs_workqueue);
> > - z_erofs_destroy_pcluster_pool();
> > + unsigned long cpu;
> > +
> > + for_each_possible_cpu(cpu) {
> > + if (z_erofs_kthread_pool[cpu]) {
> > + kthread_destroy_worker(z_erofs_kthread_pool[cpu]);
> > + z_erofs_kthread_pool[cpu] = NULL;
> > + }
> > + }
> > + kfree(z_erofs_kthread_pool);
> > }
> >
> > -static inline int z_erofs_init_workqueue(void)
> > +static int z_erofs_create_kthread_workers(void)
> > {
> > - const unsigned int onlinecpus = num_possible_cpus();
> > + unsigned long cpu;
> > + struct kthread_worker *worker;
> > +
> > + for_each_possible_cpu(cpu) {
> > + worker = kthread_create_worker_on_cpu(cpu, 0, "z_erofs/%ld", cpu);
>
> how about calling them as erofs_worker/%ld, since in the future they
> can also be used for other uses (like verification or decryption).
>
Sure, it makes sense.
> > + if (IS_ERR(worker)) {
> > + z_erofs_destroy_kthread_pool();
> > + return -ENOMEM;
> > + }
> > + sched_set_fifo(worker->task);
>
> Could we add some kernel configuration option to enable/disable this,
> since I'm not sure if all users need RT threads.
>
Ok, will do.
> > + z_erofs_kthread_pool[cpu] = worker;
> > + }
> > + return 0;
> > +}
> >
> > - /*
> > - * no need to spawn too many threads, limiting threads could minimum
> > - * scheduling overhead, perhaps per-CPU threads should be better?
> > - */
> > - z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
> > - WQ_UNBOUND | WQ_HIGHPRI,
> > - onlinecpus + onlinecpus / 4);
> > - return z_erofs_workqueue ? 0 : -ENOMEM;
> > +static int z_erofs_init_kthread_pool(void)
> > +{
> > + int err;
> > +
> > + z_erofs_kthread_pool = kcalloc(num_possible_cpus(),
> > + sizeof(struct kthread_worker *), GFP_ATOMIC);
> > + if (!z_erofs_kthread_pool)
> > + return -ENOMEM;
> > + err = z_erofs_create_kthread_workers();
> > +
> > + return err;
> > +}
> > +
> > +
> > +void z_erofs_exit_zip_subsystem(void)
> > +{
> > + z_erofs_destroy_kthread_pool();
> > + z_erofs_destroy_pcluster_pool();
> > }
> >
> > int __init z_erofs_init_zip_subsystem(void)
> > @@ -211,10 +242,16 @@ int __init z_erofs_init_zip_subsystem(void)
> > int err = z_erofs_create_pcluster_pool();
> >
> > if (err)
> > - return err;
> > - err = z_erofs_init_workqueue();
> > + goto out_error_pcluster_pool;
> > +
> > + err = z_erofs_init_kthread_pool();
> > if (err)
> > - z_erofs_destroy_pcluster_pool();
> > + goto out_error_kthread_pool;
> > +
> > + return err;
> > +out_error_kthread_pool:
> > + z_erofs_destroy_pcluster_pool();
> > +out_error_pcluster_pool:
> > return err;
> > }
> >
> > @@ -1143,7 +1180,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
> > }
> > }
> >
> > -static void z_erofs_decompressqueue_work(struct work_struct *work)
> > +static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
> > {
> > struct z_erofs_decompressqueue *bgq =
> > container_of(work, struct z_erofs_decompressqueue, u.work);
> > @@ -1170,15 +1207,16 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
> >
> > if (atomic_add_return(bios, &io->pending_bios))
> > return;
> > - /* Use workqueue and sync decompression for atomic contexts only */
> > + /* Use kthread_workers and sync decompression for atomic contexts only */
> > if (in_atomic() || irqs_disabled()) {
> > - queue_work(z_erofs_workqueue, &io->u.work);
> > + kthread_queue_work(z_erofs_kthread_pool[raw_smp_processor_id()],
> > + &io->u.work);
>
> Should we need to handle cpu online/offline as well?
>
Ok, let me try to add cpuhp support. I will work on V2.
> Thanks,
> Gao Xiang
Thanks,
Sandeep.