From: Ma Ling <[email protected]>

Wire latency (RC delay) dominates modern computer performance.
Conventional serialized work causes severe cache line ping-pong, so
the processor spends a lot of time and power to complete it,
especially on multi-core platforms.

However, if the serialized work is sent to one core and executed
ONLY when contention happens, much time and power can be saved,
because all shared data stay in the private cache of that one core.
We call this mechanism Adaptive Lock Integration (ali workqueue).

The new code is based on qspinlock and implements Lock Integration.
When a user space application hits a bottleneck on a kernel spinlock,
the new mechanism can improve performance by up to 1.65x for
https://lkml.org/lkml/2016/2/4/48 (also at
http://lkml.iu.edu/hypermail/linux/kernel/1602.0/03745.html)
and by up to 2.79x for https://lkml.org/lkml/2016/4/4/848, respectively.

Additional Makefile/Kconfig changes are included to enable building
this feature on the x86 platform.

Signed-off-by: Ma Ling <[email protected]>
---
This patch is based on https://lkml.org/lkml/2015/12/31/20;
in this version we add an init function and fix a function name.
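
For reviewers, here is a minimal usage sketch of the API added by this patch.
The caller below (foo_wq, foo_counter, foo_add(), foo_update()) is hypothetical
and not part of the patch; it only illustrates how a contended update would be
handed to the ali workqueue:

	#include <linux/aliworkqueue.h>

	/* Shared state that would otherwise sit behind a spinlock. */
	static struct ali_workqueue foo_wq;
	static unsigned long foo_counter;

	/* The serialized work: runs on whichever CPU currently drains the queue. */
	static void foo_add(void *para)
	{
		foo_counter += (unsigned long)para;
	}

	void foo_init(void)
	{
		ali_workqueue_init(&foo_wq);
	}

	void foo_update(unsigned long delta)
	{
		struct ali_workqueue_info work;

		work.fn = foo_add;
		work.para = (void *)delta;
		/* Queues the work and returns once some CPU has executed it. */
		ali_workqueue(&foo_wq, &work);
	}

Because ali_workqueue() does not return until the submitted work has run,
the work item can safely live on the caller's stack.
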
arch/x86/Kconfig | 1 +
include/linux/aliworkqueue.h | 34 ++++++++++++++
kernel/Kconfig.locks | 7 +++
kernel/locking/Makefile | 1 +
kernel/locking/aliworkqueue.c | 97 +++++++++++++++++++++++++++++++++++++++++
5 files changed, 140 insertions(+), 0 deletions(-)
create mode 100644 include/linux/aliworkqueue.h
create mode 100644 kernel/locking/aliworkqueue.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 328c835..f0a7df1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -42,6 +42,7 @@ config X86
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_USE_ALI_WORKQUEUE
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
diff --git a/include/linux/aliworkqueue.h b/include/linux/aliworkqueue.h
new file mode 100644
index 0000000..3c8437b
--- /dev/null
+++ b/include/linux/aliworkqueue.h
@@ -0,0 +1,34 @@
+#ifndef ALI_WORKQUEUE_H
+#define ALI_WORKQUEUE_H
+/*
+ * Adaptive Lock Integration
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Alibaba Group.
+ *
+ * Authors: Ma Ling <[email protected]>
+ *
+ */
+typedef struct ali_workqueue {
+ void *wq;
+} ali_workqueue_t;
+
+struct ali_workqueue_info {
+ struct ali_workqueue_info *next;
+ int pending;
+ void (*fn)(void *);
+ void *para;
+};
+
+void ali_workqueue(struct ali_workqueue *ali_wq, struct ali_workqueue_info *ali);
+void ali_workqueue_init(struct ali_workqueue *ali_wq);
+#endif /* ALI_WORKQUEUE_H */
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index ebdb004..4edc186 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -235,6 +235,13 @@ config LOCK_SPIN_ON_OWNER
def_bool y
depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER
+config ARCH_USE_ALI_WORKQUEUE
+ bool
+
+config ALI_WORKQUEUE
+ def_bool y if ARCH_USE_ALI_WORKQUEUE
+ depends on SMP
+
config ARCH_USE_QUEUED_SPINLOCKS
bool
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8e96f6c..b7c1280 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
+obj-$(CONFIG_ALI_WORKQUEUE) += aliworkqueue.o
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
obj-$(CONFIG_SMP) += lglock.o
diff --git a/kernel/locking/aliworkqueue.c b/kernel/locking/aliworkqueue.c
new file mode 100644
index 0000000..fe4c88e
--- /dev/null
+++ b/kernel/locking/aliworkqueue.c
@@ -0,0 +1,97 @@
+/*
+ * Adaptive Lock Integration
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Alibaba Group.
+ *
+ * Authors: Ma Ling <[email protected]>
+ *
+ */
+#include <asm/processor.h>
+#include <asm/cmpxchg.h>
+#include <linux/aliworkqueue.h>
+/*
+ * Wire latency (RC delay) dominates modern computer performance.
+ * Conventional serialized work causes severe cache line ping-pong, so
+ * the processor spends a lot of time and power to complete it,
+ * especially on multi-core platforms.
+ *
+ * However, if the serialized work is sent to one core and executed
+ * ONLY when contention happens, much time and power can be saved,
+ * because all shared data stay in the private cache of that one core.
+ * We call this mechanism Adaptive Lock Integration
+ * (ali workqueue).
+ *
+ */
+void ali_workqueue(struct ali_workqueue *ali_wq, struct ali_workqueue_info *ali)
+{
+	struct ali_workqueue_info *next, *old;
+
+	ali->next = NULL;
+	ali->pending = 1;
+	old = xchg(&ali_wq->wq, ali);
+
+	/* If old is NULL we are the first one */
+	if (old) {
+		/* Append ourselves to the work queue */
+		WRITE_ONCE(old->next, ali);
+
+		/* Wait until our work is completed */
+		while (READ_ONCE(ali->pending))
+			cpu_relax_lowlatency();
+		return;
+	}
+	old = READ_ONCE(ali_wq->wq);
+
+	/* Handle all pending works */
+repeat:
+	if (old == ali)
+		goto end;
+
+	while (!(next = READ_ONCE(ali->next)))
+		cpu_relax_lowlatency();
+
+	ali->fn(ali->para);
+	ali->pending = 0;
+
+	if (old != next) {
+		while (!(ali = READ_ONCE(next->next)))
+			cpu_relax_lowlatency();
+		next->fn(next->para);
+		next->pending = 0;
+		goto repeat;
+
+	} else
+		ali = next;
+end:
+	ali->fn(ali->para);
+	/* If we are the last one, clear the workqueue and return */
+	old = cmpxchg(&ali_wq->wq, old, NULL);
+
+	if (old != ali) {
+		/* There is still pending work to do */
+		while (!(next = READ_ONCE(ali->next)))
+			cpu_relax_lowlatency();
+		ali->pending = 0;
+		ali = next;
+		goto repeat;
+	}
+
+	ali->pending = 0;
+	return;
+}
+
+/* Init ali work queue */
+void ali_workqueue_init(struct ali_workqueue *ali_wq)
+{
+ WRITE_ONCE(ali_wq->wq, NULL);
+}
--
1.7.1
On 04/15/2016 12:05 AM, [email protected] wrote:
As I said before, you need a use case within the kernel to demonstrate
its usefulness. The Linux kernel community will not accept code that
isn't used anywhere.
A major problem with converting regular locking code to use the
aliworkqueue is that it requires rather significant code changes. So you
really need a good use case where you can show that the performance benefit
is much greater than the cost of making the conversion.
Cheers,
Longman