2023-01-06 03:42:50

by Sandeep Dhavale

[permalink] [raw]
Subject: [PATCH v2] erofs: replace erofs_unzipd workqueue with per-cpu threads

Using per-cpu thread pool we can reduce the scheduling latency compared
to workqueue implementation. With this patch scheduling latency and
variation is reduced as per-cpu threads are high priority kthread_workers.

The results were evaluated on arm64 Android devices running 5.10 kernel.

The table below shows resulting improvements of total scheduling latency
for the same app launch benchmark runs with 50 iterations. Scheduling
latency is the latency between when the task (workqueue kworker vs
kthread_worker) became eligible to run to when it actually started
running.
+-------------------------+-----------+----------------+---------+
| | workqueue | kthread_worker | diff |
+-------------------------+-----------+----------------+---------+
| Average (us) | 15253 | 2914 | -80.89% |
| Median (us) | 14001 | 2912 | -79.20% |
| Minimum (us) | 3117 | 1027 | -67.05% |
| Maximum (us) | 30170 | 3805 | -87.39% |
| Standard deviation (us) | 7166 | 359 | |
+-------------------------+-----------+----------------+---------+

Signed-off-by: Sandeep Dhavale <[email protected]>
---
V1 -> V2
* Changed name of kthread_workers from z_erofs to erofs_worker
* Added kernel configuration to run kthread_workers at normal or
high priority
* Added cpu hotplug support
* Added wrapped kthread_workers under worker_pool
* Added one unbound thread in a pool to handle a context where
we already stopped per-cpu kthread worker
* Updated commit message

Background: Boot times and cold app launch benchmarks are very
important to android ecosystem as they directly translate to
responsiveness from user point of view. While erofs provides
lot of important features like space savings, we saw some
performance penalty in cold app launch benchmarks in few scenarios.
Analysis showed that the significant variance was coming from the
scheduling cost while decompression cost was more or less the same.

Having per-cpu thread pool we can see from above table that this
variation is reduced by ~80% on average. Link to LPC 2022 slides and
talk at [1]

[1] https://lpc.events/event/16/contributions/1338/
---
fs/erofs/Kconfig | 11 +++
fs/erofs/zdata.c | 201 +++++++++++++++++++++++++++++++++++++++++------
fs/erofs/zdata.h | 4 +-
3 files changed, 192 insertions(+), 24 deletions(-)

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 85490370e0ca..879f493c6641 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -108,3 +108,14 @@ config EROFS_FS_ONDEMAND
read support.

If unsure, say N.
+
+config EROFS_FS_KTHREAD_HIPRI
+ bool "EROFS high priority percpu kthread workers"
+ depends on EROFS_FS
+ default n
+ help
+ EROFS uses per cpu kthread workers pool to carry out async work.
+ This permits EROFS to configure per cpu kthread workers to run
+ at higher priority.
+
+ If unsure, say N.
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index ccf7c55d477f..5df2e1eb7f54 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -8,6 +8,8 @@
#include "compress.h"
#include <linux/prefetch.h>
#include <linux/psi.h>
+#include <linux/slab.h>
+#include <linux/cpuhotplug.h>

#include <trace/events/erofs.h>

@@ -184,26 +186,152 @@ typedef tagptr1_t compressed_page_t;
#define tag_compressed_page_justfound(page) \
tagptr_fold(compressed_page_t, page, 1)

-static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+struct erofs_kthread_worker_pool {
+ struct kthread_worker __rcu **workers;
+ struct kthread_worker *unbound_worker;
+};

-void z_erofs_exit_zip_subsystem(void)
+static struct erofs_kthread_worker_pool worker_pool;
+DEFINE_SPINLOCK(worker_pool_lock);
+
+void erofs_destroy_worker_pool(void)
{
- destroy_workqueue(z_erofs_workqueue);
- z_erofs_destroy_pcluster_pool();
+ unsigned int cpu;
+ struct kthread_worker *worker;
+
+ for_each_possible_cpu(cpu) {
+ worker = rcu_dereference_protected(
+ worker_pool.workers[cpu],
+ 1);
+ rcu_assign_pointer(worker_pool.workers[cpu], NULL);
+
+ if (worker)
+ kthread_destroy_worker(worker);
+ }
+
+ if (worker_pool.unbound_worker)
+ kthread_destroy_worker(worker_pool.unbound_worker);
+
+ kfree(worker_pool.workers);
}

-static inline int z_erofs_init_workqueue(void)
+static inline void erofs_set_worker_priority(struct kthread_worker *worker)
{
- const unsigned int onlinecpus = num_possible_cpus();
+#ifdef CONFIG_EROFS_FS_KTHREAD_HIPRI
+ sched_set_fifo_low(worker->task);
+#else
+ sched_set_normal(worker->task, 0);
+#endif
+}

- /*
- * no need to spawn too many threads, limiting threads could minimum
- * scheduling overhead, perhaps per-CPU threads should be better?
- */
- z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
- WQ_UNBOUND | WQ_HIGHPRI,
- onlinecpus + onlinecpus / 4);
- return z_erofs_workqueue ? 0 : -ENOMEM;
+static int erofs_create_kthread_workers(void)
+{
+ unsigned int cpu;
+ struct kthread_worker *worker;
+
+ for_each_online_cpu(cpu) {
+ worker = kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);
+ if (IS_ERR(worker)) {
+ erofs_destroy_worker_pool();
+ return -ENOMEM;
+ }
+ erofs_set_worker_priority(worker);
+ rcu_assign_pointer(worker_pool.workers[cpu], worker);
+ }
+
+ worker = kthread_create_worker(0, "erofs_unbound_worker");
+ if (IS_ERR(worker)) {
+ erofs_destroy_worker_pool();
+ return PTR_ERR(worker);
+ }
+ erofs_set_worker_priority(worker);
+ worker_pool.unbound_worker = worker;
+
+ return 0;
+}
+
+static int erofs_init_worker_pool(void)
+{
+ int err;
+
+ worker_pool.workers = kcalloc(num_possible_cpus(),
+ sizeof(struct kthread_worker *), GFP_ATOMIC);
+ if (!worker_pool.workers)
+ return -ENOMEM;
+ err = erofs_create_kthread_workers();
+
+ return err;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static enum cpuhp_state erofs_cpuhp_state;
+static int erofs_cpu_online(unsigned int cpu)
+{
+ struct kthread_worker *worker;
+
+ worker = kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);
+ if (IS_ERR(worker))
+ return -ENOMEM;
+
+ erofs_set_worker_priority(worker);
+
+ spin_lock(&worker_pool_lock);
+ rcu_assign_pointer(worker_pool.workers[cpu], worker);
+ spin_unlock(&worker_pool_lock);
+
+ synchronize_rcu();
+
+ return 0;
+}
+
+static int erofs_cpu_offline(unsigned int cpu)
+{
+ struct kthread_worker *worker;
+
+ spin_lock(&worker_pool_lock);
+ worker = rcu_dereference_protected(worker_pool.workers[cpu],
+ lockdep_is_held(&worker_pool_lock));
+ rcu_assign_pointer(worker_pool.workers[cpu], NULL);
+ spin_unlock(&worker_pool_lock);
+
+ synchronize_rcu();
+
+ if (worker)
+ kthread_destroy_worker(worker);
+
+ return 0;
+}
+
+static int erofs_cpu_hotplug_init(void)
+{
+ int state;
+
+ state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "fs/erofs:online",
+ erofs_cpu_online, erofs_cpu_offline);
+ if (state < 0)
+ return state;
+
+ erofs_cpuhp_state = state;
+
+ return 0;
+}
+
+static void erofs_cpu_hotplug_destroy(void)
+{
+ if (erofs_cpuhp_state)
+ cpuhp_remove_state_nocalls(erofs_cpuhp_state);
+}
+#else /* !CONFIG_HOTPLUG_CPU */
+static inline int erofs_cpu_hotplug_init(void) { return 0; }
+static inline void erofs_cpu_hotplug_destroy(void) {}
+#endif
+
+void z_erofs_exit_zip_subsystem(void)
+{
+ erofs_cpu_hotplug_destroy();
+ erofs_destroy_worker_pool();
+ z_erofs_destroy_pcluster_pool();
}

int __init z_erofs_init_zip_subsystem(void)
@@ -211,10 +339,23 @@ int __init z_erofs_init_zip_subsystem(void)
int err = z_erofs_create_pcluster_pool();

if (err)
- return err;
- err = z_erofs_init_workqueue();
+ goto out_error_pcluster_pool;
+
+ err = erofs_init_worker_pool();
if (err)
- z_erofs_destroy_pcluster_pool();
+ goto out_error_worker_pool;
+
+ err = erofs_cpu_hotplug_init();
+ if (err < 0)
+ goto out_error_cpuhp_init;
+
+ return err;
+
+out_error_cpuhp_init:
+ erofs_destroy_worker_pool();
+out_error_worker_pool:
+ z_erofs_destroy_pcluster_pool();
+out_error_pcluster_pool:
return err;
}

@@ -1143,7 +1284,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
}
}

-static void z_erofs_decompressqueue_work(struct work_struct *work)
+static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
{
struct z_erofs_decompressqueue *bgq =
container_of(work, struct z_erofs_decompressqueue, u.work);
@@ -1156,6 +1297,20 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
kvfree(bgq);
}

+static void erofs_schedule_kthread_work(struct kthread_work *work)
+{
+ struct kthread_worker *worker;
+ unsigned int cpu = raw_smp_processor_id();
+
+ rcu_read_lock();
+ worker = rcu_dereference(worker_pool.workers[cpu]);
+ if (!worker)
+ worker = worker_pool.unbound_worker;
+
+ kthread_queue_work(worker, work);
+ rcu_read_unlock();
+}
+
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
bool sync, int bios)
{
@@ -1170,15 +1325,15 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,

if (atomic_add_return(bios, &io->pending_bios))
return;
- /* Use workqueue and sync decompression for atomic contexts only */
+ /* Use kthread_workers and sync decompression for atomic contexts only */
if (in_atomic() || irqs_disabled()) {
- queue_work(z_erofs_workqueue, &io->u.work);
+ erofs_schedule_kthread_work(&io->u.work);
/* enable sync decompression for readahead */
if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
return;
}
- z_erofs_decompressqueue_work(&io->u.work);
+ z_erofs_decompressqueue_kthread_work(&io->u.work);
}

static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
@@ -1306,7 +1461,7 @@ jobqueue_init(struct super_block *sb,
*fg = true;
goto fg_out;
}
- INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
+ kthread_init_work(&q->u.work, z_erofs_decompressqueue_kthread_work);
} else {
fg_out:
q = fgq;
@@ -1500,7 +1655,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,

/*
* although background is preferred, no one is pending for submission.
- * don't issue workqueue for decompression but drop it directly instead.
+ * don't issue kthread_work for decompression but drop it directly instead.
*/
if (!*force_fg && !nr_bios) {
kvfree(q[JQ_SUBMIT]);
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index d98c95212985..808bbbf71b7b 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -6,6 +6,8 @@
#ifndef __EROFS_FS_ZDATA_H
#define __EROFS_FS_ZDATA_H

+#include <linux/kthread.h>
+
#include "internal.h"
#include "tagptr.h"

@@ -107,7 +109,7 @@ struct z_erofs_decompressqueue {

union {
struct completion done;
- struct work_struct work;
+ struct kthread_work work;
} u;

bool eio;
--
2.39.0.314.g84b9a713c41-goog


2023-01-06 04:54:27

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v2] erofs: replace erofs_unzipd workqueue with per-cpu threads

Hi Sandeep,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on xiang-erofs/dev-test]
[also build test WARNING on xiang-erofs/dev xiang-erofs/fixes linus/master v6.2-rc2 next-20230105]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Sandeep-Dhavale/erofs-replace-erofs_unzipd-workqueue-with-per-cpu-threads/20230106-110024
base: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev-test
patch link: https://lore.kernel.org/r/20230106025719.3870252-1-dhavale%40google.com
patch subject: [PATCH v2] erofs: replace erofs_unzipd workqueue with per-cpu threads
config: x86_64-randconfig-a003
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/805027b1ff1976adf5fca411d328a30cea6e4cd6
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Sandeep-Dhavale/erofs-replace-erofs_unzipd-workqueue-with-per-cpu-threads/20230106-110024
git checkout 805027b1ff1976adf5fca411d328a30cea6e4cd6
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash fs/erofs/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

>> fs/erofs/zdata.c:197:6: warning: no previous prototype for function 'erofs_destroy_worker_pool' [-Wmissing-prototypes]
void erofs_destroy_worker_pool(void)
^
fs/erofs/zdata.c:197:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
void erofs_destroy_worker_pool(void)
^
static
1 warning generated.


vim +/erofs_destroy_worker_pool +197 fs/erofs/zdata.c

196
> 197 void erofs_destroy_worker_pool(void)
198 {
199 unsigned int cpu;
200 struct kthread_worker *worker;
201
202 for_each_possible_cpu(cpu) {
203 worker = rcu_dereference_protected(
204 worker_pool.workers[cpu],
205 1);
206 rcu_assign_pointer(worker_pool.workers[cpu], NULL);
207
208 if (worker)
209 kthread_destroy_worker(worker);
210 }
211
212 if (worker_pool.unbound_worker)
213 kthread_destroy_worker(worker_pool.unbound_worker);
214
215 kfree(worker_pool.workers);
216 }
217

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests


Attachments:
(No filename) (3.06 kB)
config (156.89 kB)
Download all attachments

2023-01-06 04:55:57

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v2] erofs: replace erofs_unzipd workqueue with per-cpu threads

Hi Sandeep,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on xiang-erofs/dev-test]
[also build test WARNING on xiang-erofs/dev xiang-erofs/fixes linus/master v6.2-rc2 next-20230105]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Sandeep-Dhavale/erofs-replace-erofs_unzipd-workqueue-with-per-cpu-threads/20230106-110024
base: https://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git dev-test
patch link: https://lore.kernel.org/r/20230106025719.3870252-1-dhavale%40google.com
patch subject: [PATCH v2] erofs: replace erofs_unzipd workqueue with per-cpu threads
config: m68k-allmodconfig
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/805027b1ff1976adf5fca411d328a30cea6e4cd6
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Sandeep-Dhavale/erofs-replace-erofs_unzipd-workqueue-with-per-cpu-threads/20230106-110024
git checkout 805027b1ff1976adf5fca411d328a30cea6e4cd6
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash fs/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

>> fs/erofs/zdata.c:197:6: warning: no previous prototype for 'erofs_destroy_worker_pool' [-Wmissing-prototypes]
197 | void erofs_destroy_worker_pool(void)
| ^~~~~~~~~~~~~~~~~~~~~~~~~


vim +/erofs_destroy_worker_pool +197 fs/erofs/zdata.c

196
> 197 void erofs_destroy_worker_pool(void)
198 {
199 unsigned int cpu;
200 struct kthread_worker *worker;
201
202 for_each_possible_cpu(cpu) {
203 worker = rcu_dereference_protected(
204 worker_pool.workers[cpu],
205 1);
206 rcu_assign_pointer(worker_pool.workers[cpu], NULL);
207
208 if (worker)
209 kthread_destroy_worker(worker);
210 }
211
212 if (worker_pool.unbound_worker)
213 kthread_destroy_worker(worker_pool.unbound_worker);
214
215 kfree(worker_pool.workers);
216 }
217

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests


Attachments:
(No filename) (2.81 kB)
config (284.27 kB)
Download all attachments