This makes getcpu() ~1000 times faster, which is very useful when
implementing per-cpu buffers in userspace (to avoid cache line
bouncing). As an example, lttng-ust becomes ~30% faster.
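(Illustrative sketch only, not part of this patch: one way a userspace
per-cpu buffer might use the fast getcpu(); per_cpu_buf, NR_BUF_SLOTS
and record_event() are made-up names, and sched_getcpu() is assumed to
resolve to the vDSO path.)

#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>

#define NR_BUF_SLOTS	64		/* >= number of possible CPUs */

struct slot {
	uint64_t events;
	char pad[56];			/* pad out to a 64-byte cache line */
};

static struct slot per_cpu_buf[NR_BUF_SLOTS];

static void record_event(void)
{
	int cpu = sched_getcpu();	/* cheap when backed by the vDSO */

	if (cpu >= 0 && cpu < NR_BUF_SLOTS)
		per_cpu_buf[cpu].events++;	/* each CPU stays on its own cache line */
}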
The patch will break applications using TPIDRURW (which is context switched
since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
Preserve the user r/w register TPIDRURW on context switch and fork")) and
is therefore made configurable.
Signed-off-by: Fredrik Markstrom <[email protected]>
---
arch/arm/include/asm/tls.h | 8 +++++++-
arch/arm/kernel/entry-armv.S | 1 -
arch/arm/mm/Kconfig | 10 ++++++++++
arch/arm/vdso/Makefile | 3 +++
arch/arm/vdso/vdso.lds.S | 3 +++
arch/arm/vdso/vgetcpu.c | 34 ++++++++++++++++++++++++++++++++++
6 files changed, 57 insertions(+), 2 deletions(-)
create mode 100644 arch/arm/vdso/vgetcpu.c
diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 5f833f7..170fd76 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -10,10 +10,15 @@
.endm
.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+#ifdef CONFIG_VDSO_GETCPU
+ ldr \tpuser, [r2, #TI_CPU]
+#else
mrc p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register
+ ldr \tpuser, [r2, #TI_TP_VALUE + 4]
+ str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+#endif
mcr p15, 0, \tp, c13, c0, 3 @ set TLS register
mcr p15, 0, \tpuser, c13, c0, 2 @ and the user r/w register
- str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
.endm
.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
@@ -22,6 +27,7 @@
mov \tmp2, #0xffff0fff
tst \tmp1, #HWCAP_TLS @ hardware TLS available?
streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0
+ ldrne \tpuser, [r2, #TI_TP_VALUE + 4] @ load the saved user r/w reg
mrcne p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register
mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register
mcrne p15, 0, \tpuser, c13, c0, 2 @ set user r/w register
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9f157e7..4e1369a 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -787,7 +787,6 @@ ENTRY(__switch_to)
THUMB( str sp, [ip], #4 )
THUMB( str lr, [ip], #4 )
ldr r4, [r2, #TI_TP_VALUE]
- ldr r5, [r2, #TI_TP_VALUE + 4]
#ifdef CONFIG_CPU_USE_DOMAINS
mrc p15, 0, r6, c3, c0, 0 @ Get domain register
str r6, [r1, #TI_CPU_DOMAIN] @ Save old domain register
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index c1799dd..f18334a 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -854,6 +854,16 @@ config VDSO
You must have glibc 2.22 or later for programs to seamlessly
take advantage of this.
+config VDSO_GETCPU
+ bool "Enable VDSO for getcpu"
+ depends on VDSO && (CPU_V6K || CPU_V7 || CPU_V7M)
+ help
+ Say Y to make getcpu a VDSO (fast) call. This is useful if you
+ want to implement per cpu buffers to avoid cache line bouncing
+ in user mode.
+ This mechanism uses the TPIDRURW register, so enabling it will break
+ applications that use this register for their own purposes.
+
config DMA_CACHE_RWFO
bool "Enable read/write for ownership DMA cache maintenance"
depends on CPU_V6K && SMP
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index 59a8fa7..9f1ec51 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -1,6 +1,9 @@
hostprogs-y := vdsomunge
obj-vdso := vgettimeofday.o datapage.o
+ifeq ($(CONFIG_VDSO_GETCPU),y)
+obj-vdso += vgetcpu.o
+endif
# Build rules
targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds
diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S
index 89ca89f..1af39fb 100644
--- a/arch/arm/vdso/vdso.lds.S
+++ b/arch/arm/vdso/vdso.lds.S
@@ -82,6 +82,9 @@ VERSION
global:
__vdso_clock_gettime;
__vdso_gettimeofday;
+#ifdef CONFIG_VDSO_GETCPU
+ __vdso_getcpu;
+#endif
local: *;
};
}
diff --git a/arch/arm/vdso/vgetcpu.c b/arch/arm/vdso/vgetcpu.c
new file mode 100644
index 0000000..1b710af
--- /dev/null
+++ b/arch/arm/vdso/vgetcpu.c
@@ -0,0 +1,34 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/compiler.h>
+#include <asm/topology.h>
+
+struct getcpu_cache;
+
+notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
+ struct getcpu_cache *tcache)
+{
+ unsigned long node_and_cpu;
+
+ asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
+
+ if (nodep)
+ *nodep = cpu_to_node(node_and_cpu >> 16);
+ if (cpup)
+ *cpup = node_and_cpu & 0xffffUL;
+
+ return 0;
+}
+
--
2.7.2
On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
> This makes getcpu() ~1000 times faster, this is very useful when
> implementing per-cpu buffers in userspace (to avoid cache line
> bouncing). As an example lttng ust becomes ~30% faster.
>
> The patch will break applications using TPIDRURW (which is context switched
> since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
It looks like you dropped the leading 'a' from the commit ID. For
everyone else's benefit, the full ID is:
a4780adeefd042482f624f5e0d577bf9cdcbb760
Please note that arm64 has done similar for compat tasks since commit:
d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
compat tasks")
> Preserve the user r/w register TPIDRURW on context switch and fork")) and
> is therefore made configurable.
As you note above, this is an ABI break and *will* break some existing
applications. That's generally a no-go.
This also leaves arm64's compat with the existing behaviour, differing
from arm.
I was under the impression that other mechanisms were being considered
for fast userspace access to per-cpu data structures, e.g. restartable
sequences. What is the state of those? Why is this better?
If getcpu() specifically is necessary, is there no other way to
implement it?
> +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
> + struct getcpu_cache *tcache)
> +{
> + unsigned long node_and_cpu;
> +
> + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
> +
> + if (nodep)
> + *nodep = cpu_to_node(node_and_cpu >> 16);
> + if (cpup)
> + *cpup = node_and_cpu & 0xffffUL;
Given this is directly user-accessible, this format is a de-facto ABI,
even if it's not documented as such. Is this definitely the format you
want long-term?
Thanks,
Mark.
On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <[email protected]> wrote:
>
> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
> > This makes getcpu() ~1000 times faster, this is very useful when
> > implementing per-cpu buffers in userspace (to avoid cache line
> > bouncing). As an example lttng ust becomes ~30% faster.
> >
> > The patch will break applications using TPIDRURW (which is context switched
> > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
>
> It looks like you dropped the leading 'a' from the commit ID. For
> everyone else's benefit, the full ID is:
>
> a4780adeefd042482f624f5e0d577bf9cdcbb760
Sorry for that and thanks for fixing it.
>
>
> Please note that arm64 has done similar for compat tasks since commit:
>
> d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
> compat tasks")
>
> > Preserve the user r/w register TPIDRURW on context switch and fork")) and
> > is therefore made configurable.
>
> As you note above, this is an ABI break and *will* break some existing
> applications. That's generally a no-go.
Ok, I wasn't sure this was considered an ABI (but I'm not entirely
surprised ;) ). The way I was trying to defend the breakage was by
reasoning that if it was an ABI we broke it both with a4780ad and with
6a1c531, and since we don't break ABIs, it can't be one.
But hey, I'm humble here and ready to back off.
>
> This also leaves arm64's compat with the existing behaviour, differing
> from arm.
>
> I was under the impression that other mechanisms were being considered
> for fast userspace access to per-cpu data structures, e.g. restartable
> sequences. What is the state of those? Why is this better?
>
> If getcpu() specifically is necessary, is there no other way to
> implement it?
If you are referring to the user space stuff, it can probably be
implemented in other ways; it's just convenient since the interface is
there and it will speed up stuff like lttng without modifications
(well, except glibc). It's also already implemented as a vDSO call on
other major architectures (like x86, x86_64, ppc32 and ppc64).
If you are referring to the implementation of the vDSO call, there are
other possibilities, but I haven't found any that don't introduce
overhead in context switching.
But if TPIDRURW is definitely a no-go, I can work on a patch that does
this with a thread notifier and the vDSO data page. Would that be a
viable option?
>
> > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
> > + struct getcpu_cache *tcache)
> > +{
> > + unsigned long node_and_cpu;
> > +
> > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
> > +
> > + if (nodep)
> > + *nodep = cpu_to_node(node_and_cpu >> 16);
> > + if (cpup)
> > + *cpup = node_and_cpu & 0xffffUL;
>
> Given this is directly user-accessible, this format is a de-facto ABI,
> even if it's not documented as such. Is this definitely the format you
> want long-term?
Yes, this (the interface) is indeed the important part, and therefore
I tried not to invent anything on my own. This is the interface used by
ppc32, ppc64, x86 and x86_64, and it's also how the getcpu(2) system
call is documented.
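(For reference, a minimal sketch of that documented interface, invoked
here through syscall(2); the vDSO entry point takes the same three
arguments, and the third (cache) argument is ignored on modern kernels.)

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	unsigned int cpu, node;

	/* getcpu(2): cpu and node are filled in, the cache argument is unused. */
	if (syscall(SYS_getcpu, &cpu, &node, NULL) == 0)
		printf("cpu=%u node=%u\n", cpu, node);

	return 0;
}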
/Fredrik
>
>
> Thanks,
> Mark.
The approach I suggested below with the vDSO data page will obviously
not work on smp, so suggestions are welcome.
/Fredrik
On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markström
<[email protected]> wrote:
> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <[email protected]> wrote:
>>
>> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
>> > This makes getcpu() ~1000 times faster, this is very useful when
>> > implementing per-cpu buffers in userspace (to avoid cache line
>> > bouncing). As an example lttng ust becomes ~30% faster.
>> >
>> > The patch will break applications using TPIDRURW (which is context switched
>> > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
>>
>> It looks like you dropped the leading 'a' from the commit ID. For
>> everyone else's benefit, the full ID is:
>>
>> a4780adeefd042482f624f5e0d577bf9cdcbb760
>
>
> Sorry for that and thanks for fixing it.
>
>>
>>
>> Please note that arm64 has done similar for compat tasks since commit:
>>
>> d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
>> compat tasks")
>>
>> > Preserve the user r/w register TPIDRURW on context switch and fork")) and
>> > is therefore made configurable.
>>
>> As you note above, this is an ABI break and *will* break some existing
>> applications. That's generally a no-go.
>
>
> Ok, I wasn't sure this was considered an ABI (but I'm not entirely
> surprised ;) ). The way I was
> trying to defend the breakage was by reasoning that that if it was an
> ABI we broke it both with a4780ad
> and with 6a1c531, and since we don't break ABI:s, it can't be one.
>
> But hey, I'm humble here and ready to back off.
>
>>
>> This also leaves arm64's compat with the existing behaviour, differing
>> from arm.
>>
>> I was under the impression that other mechanisms were being considered
>> for fast userspace access to per-cpu data structures, e.g. restartable
>> sequences. What is the state of those? Why is this better?
>>
>> If getcpu() specifically is necessary, is there no other way to
>> implement it?
>
> If you are referring to the user space stuff can probably be
> implemented other ways,
> it's just convenient since the interface is there and it will speed up
> stuff like lttng without
> modifications (well, except glibc). It's also already implemented as a
> vDSO on other
> major architectures (like x86, x86_64, ppc32 and ppc64).
>
> If you are referring to the implementation of the vdso call, there are
> other possibilities, but
> I haven't found any that doesn't introduce overhead in context switching.
>
> But if TPIDRURW is definitely a no go, I can work on a patch that does
> this with a thread notifier
> and the vdso data page. Would that be a viable option ?
>
>>
>> > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
>> > + struct getcpu_cache *tcache)
>> > +{
>> > + unsigned long node_and_cpu;
>> > +
>> > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
>> > +
>> > + if (nodep)
>> > + *nodep = cpu_to_node(node_and_cpu >> 16);
>> > + if (cpup)
>> > + *cpup = node_and_cpu & 0xffffUL;
>>
>> Given this is directly user-accessible, this format is a de-facto ABI,
>> even if it's not documented as such. Is this definitely the format you
>> want long-term?
>
> Yes, this (the interface) is indeed the important part and therefore I
> tried not to invent anything
> on my own.
> This is the interface used by ppc32, ppc64, x86, x86_64. It's also this is
> how the getcpu(2) system call is documented.
>
> /Fredrik
>
>
>>
>>
>> Thanks,
>> Mark.
--
/Fredrik
On 05/10/16 17:39, Fredrik Markström wrote:
> The approach I suggested below with the vDSO data page will obviously
> not work on smp, so suggestions are welcome.
Well, given that it's user-writeable, is there any reason an application
which cares couldn't simply run some per-cpu threads to call getcpu()
once and cache the result in TPIDRURW themselves? That would appear to
both raise no compatibility issues and work with existing kernels.
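(For reference, a minimal sketch of the user-mode accessors that
suggestion assumes; TPIDRURW is readable and writable at PL0 on ARMv6K
and later, so no kernel support is needed for the access itself.)

/* Sketch: direct user-mode access to TPIDRURW (cp15, c13, c0, 2). */
static inline unsigned long read_tpidrurw(void)
{
	unsigned long val;

	asm volatile("mrc p15, 0, %0, c13, c0, 2" : "=r" (val));
	return val;
}

static inline void write_tpidrurw(unsigned long val)
{
	asm volatile("mcr p15, 0, %0, c13, c0, 2" : : "r" (val));
}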
Robin.
> /Fredrik
>
>
> On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markström
> <[email protected]> wrote:
>> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <[email protected]> wrote:
>>>
>>> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
>>>> This makes getcpu() ~1000 times faster, this is very useful when
>>>> implementing per-cpu buffers in userspace (to avoid cache line
>>>> bouncing). As an example lttng ust becomes ~30% faster.
>>>>
>>>> The patch will break applications using TPIDRURW (which is context switched
>>>> since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
>>>
>>> It looks like you dropped the leading 'a' from the commit ID. For
>>> everyone else's benefit, the full ID is:
>>>
>>> a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
>>
>> Sorry for that and thanks for fixing it.
>>
>>>
>>>
>>> Please note that arm64 has done similar for compat tasks since commit:
>>>
>>> d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
>>> compat tasks")
>>>
>>>> Preserve the user r/w register TPIDRURW on context switch and fork")) and
>>>> is therefore made configurable.
>>>
>>> As you note above, this is an ABI break and *will* break some existing
>>> applications. That's generally a no-go.
>>
>>
>> Ok, I wasn't sure this was considered an ABI (but I'm not entirely
>> surprised ;) ). The way I was
>> trying to defend the breakage was by reasoning that that if it was an
>> ABI we broke it both with a4780ad
>> and with 6a1c531, and since we don't break ABI:s, it can't be one.
>>
>> But hey, I'm humble here and ready to back off.
>>
>>>
>>> This also leaves arm64's compat with the existing behaviour, differing
>>> from arm.
>>>
>>> I was under the impression that other mechanisms were being considered
>>> for fast userspace access to per-cpu data structures, e.g. restartable
>>> sequences. What is the state of those? Why is this better?
>>>
>>> If getcpu() specifically is necessary, is there no other way to
>>> implement it?
>>
>> If you are referring to the user space stuff can probably be
>> implemented other ways,
>> it's just convenient since the interface is there and it will speed up
>> stuff like lttng without
>> modifications (well, except glibc). It's also already implemented as a
>> vDSO on other
>> major architectures (like x86, x86_64, ppc32 and ppc64).
>>
>> If you are referring to the implementation of the vdso call, there are
>> other possibilities, but
>> I haven't found any that doesn't introduce overhead in context switching.
>>
>> But if TPIDRURW is definitely a no go, I can work on a patch that does
>> this with a thread notifier
>> and the vdso data page. Would that be a viable option ?
>>
>>>
>>>> +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
>>>> + struct getcpu_cache *tcache)
>>>> +{
>>>> + unsigned long node_and_cpu;
>>>> +
>>>> + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
>>>> +
>>>> + if (nodep)
>>>> + *nodep = cpu_to_node(node_and_cpu >> 16);
>>>> + if (cpup)
>>>> + *cpup = node_and_cpu & 0xffffUL;
>>>
>>> Given this is directly user-accessible, this format is a de-facto ABI,
>>> even if it's not documented as such. Is this definitely the format you
>>> want long-term?
>>
>> Yes, this (the interface) is indeed the important part and therefore I
>> tried not to invent anything
>> on my own.
>> This is the interface used by ppc32, ppc64, x86, x86_64. It's also this is
>> how the getcpu(2) system call is documented.
>>
>> /Fredrik
>>
>>
>>>
>>>
>>> Thanks,
>>> Mark.
>
>
>
On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote:
> On 05/10/16 17:39, Fredrik Markström wrote:
> > The approach I suggested below with the vDSO data page will obviously
> > not work on smp, so suggestions are welcome.
>
> Well, given that it's user-writeable, is there any reason an application
> which cares couldn't simply run some per-cpu threads to call getcpu()
> once and cache the result in TPIDRURW themselves? That would appear to
> both raise no compatibility issues and work with existing kernels.
There is - the contents of TPIDRURW is thread specific, and it moves
with the thread between CPU cores. So, if a thread was running on CPU0
when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1,
TPIDRURW would still contain 0.
I'm also not in favour of changing the TPIDRURW usage to be a storage
repository for the CPU number - it's far too specific a usage and seems
like a waste of hardware resources to solve one problem. As Mark says,
it's an ABI breaking change too, even if it is under a config option.
Take a moment to consider distro kernels: how should they set this
config option - should they enable it to get faster getcpu(), or should
they disable it to retain existing compatibility and prevent userspace
breakage? Who can advise them to make the right decision? Kernel
developers can't, because the usage of this register is purely a
userspace issue right now, and kernel devs don't know what use it's
been put to.
--
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.
On Wed, Oct 05, 2016 at 02:25:22PM +0200, Fredrik Markström wrote:
> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <[email protected]> wrote:
> > On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
> The way I was trying to defend the breakage was by reasoning that if it
> was an ABI we broke it both with a4780ad and with 6a1c531, and since we don't
> break ABIs, it can't be one.
Prior to commit 6a1c531, other programs and/or cpuidle could have corrupted
TPIDRURW to arbitrary values at any point in time, so it couldn't have been
relied upon.
Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531 and
a4780ad to detect context switches, but in practice they don't appear to have,
and we know of an established user relying on the current behaviour.
For better or worse, the current behaviour is ABI.
> > I was under the impression that other mechanisms were being considered
> > for fast userspace access to per-cpu data structures, e.g. restartable
> > sequences. What is the state of those? Why is this better?
> >
> > If getcpu() specifically is necessary, is there no other way to
> > implement it?
>
> If you are referring to the user space stuff can probably be implemented
> other ways, it's just convenient since the interface is there and it will
> speed up stuff like lttng without modifications (well, except glibc). It's
> also already implemented as a vDSO on other major architectures (like x86,
> x86_64, ppc32 and ppc64).
>
> If you are referring to the implementation of the vdso call, there are other
> possibilities, but I haven't found any that doesn't introduce overhead in
> context switching.
>
> But if TPIDRURW is definitely a no go, I can work on a patch that does this
> with a thread notifier and the vdso data page. Would that be a viable option?
As pointed out, that won't work for SMP, but perhaps we can come up with
something that does.
> > > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
> > > + struct getcpu_cache *tcache)
> > > +{
> > > + unsigned long node_and_cpu;
> > > +
> > > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
> > > +
> > > + if (nodep)
> > > + *nodep = cpu_to_node(node_and_cpu >> 16);
> > > + if (cpup)
> > > + *cpup = node_and_cpu & 0xffffUL;
> >
> > Given this is directly user-accessible, this format is a de-facto ABI,
> > even if it's not documented as such. Is this definitely the format you
> > want long-term?
>
> Yes, this (the interface) is indeed the important part and therefore I tried
> not to invent anything on my own. This is the interface used by ppc32,
> ppc64, x86, x86_64. It's also this is how the getcpu(2) system call is
> documented.
I was referring to the value in TPIDRURW specifically. If an application
started reading that directly (rather than going through the vDSO), we wouldn't
be able to change the register value in future.
That's all moot, given we can't repurpose TPIDRURW.
Thanks,
Mark.
On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote:
> On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <[email protected]> wrote:
> As far as I understand TPIDRURW isn't anything other than an architecture
> specific piece of TLS since the last patch, possibly slightly faster than a
> "__thread u32 x;"
>
> The irony is that the two different ways it was handled earlier (not context
> switched or always set to zero on swap in) would have made it useful for this
> purpose.
The "not context switched" case was also arbitrarily corrupted, and could not
have been relied upon.
The zeroing case is similar to the restartable sequences design. So that's
probably worth looking into.
Thanks,
Mark.
On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote:
> The zeroing case is similar to the restartable sequences design. So that's
> probably worth looking into.
You're sending mixed messages: in your previous message, you said:
Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531
and a4780ad to detect context switches, but in practice they don't
appear to have, and we know of an established user relying on the
current behaviour.
For better or worse, the current behaviour is ABI.
Now you're suggesting that we could go back to the case where the
register is zeroed.
Well, the fact is that we _can_ change the TPIDRURW behaviour - we just
need to be careful about how we change it. Eg, we _could_ introduce a
per-process flag which indicates that we want some other behaviour from
TPIDRURW such as zeroing it on context switches. The default would be
to preserve the existing behaviour as doing anything else breaks
existing programs. The problem there is finding an acceptable way to
control such a flag from userspace (eg, prctl, syscall, etc).
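(A purely hypothetical sketch of what such a per-process control could
look like from userspace; the prctl command and values below are
invented for illustration and do not exist in any kernel.)

#include <sys/prctl.h>

/* Invented names and values -- no such prctl command exists. */
#define PR_SET_TPIDRURW_MODE	0x54504d44
#define TPIDRURW_MODE_PRESERVE	0	/* current behaviour (default) */
#define TPIDRURW_MODE_ZERO	1	/* zero the register on context switch */

static int opt_in_to_tpidrurw_zeroing(void)
{
	return prctl(PR_SET_TPIDRURW_MODE, TPIDRURW_MODE_ZERO, 0, 0, 0);
}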
--
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.
On Wed, Oct 5, 2016 at 10:44 PM, Mark Rutland <[email protected]> wrote:
> On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote:
>> On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <[email protected]> wrote:
>> As far as I understand TPIDRURW isn't anything other than an architecture
>> specific piece of TLS since the last patch, possibly slightly faster than a
>> "__thread u32 x;"
>>
>> The irony is that the two different ways it was handled earlier (not context
>> switched or always set to zero on swap in) would have made it useful for this
>> purpose.
>
> The "not context switched" case was also arbitrarily corrupted, and could not
> have been relied upon.
Ok, I missed that, sorry !
>
> The zeroing case is similar to the restartable sequences design. So that's
> probably worth looking into.
Ok, I'm starting to believe my best bet is to hope that those make it
into the kernel eventually; until then I'll probably just go with a
local solution.
/Fredrik
>
> Thanks,
> Mark.
On Wed, Oct 05, 2016 at 10:01:38PM +0100, Russell King - ARM Linux wrote:
> On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote:
> > The zeroing case is similar to the restartable sequences design. So that's
> > probably worth looking into.
>
> You're sending mixed messages: in your previous message, you said:
>
> Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531
> and a4780ad to detect context switches, but in practice they don't
> appear to have, and we know of an established user relying on the
> current behaviour.
>
> For better or worse, the current behaviour is ABI.
>
> Now you're suggesting that we could go back to the case where the
> register is zeroed.
Sorry; clumsy wording on my behalf.
I meant that functionality-wise, restartable sequences had similar behaviour to
the zeroing case (without touching TPIDRURW at all) and were probably worth
looking at. I did not intend to suggest that we should go back to the case where
TPIDRURW was zeroed.
> Well, the fact is that we _can_ change the TPIDRURW behaviour - we just
> need to be careful about how we change it. Eg, we _could_ introduce a
> per-process flag which indicates that we want some other behaviour from
> TPIDRURW such as zeroing it on context switches. The default would be
> to preserve the existing behaviour as doing anything else breaks
> existing programs. The problem there is finding an acceptable way to
> control such a flag from userspace (eg, prctl, syscall, etc).
Sure. Something like that could work.
Thanks,
Mark.
Hi Fredrik,
[adding Mathieu -- background is getcpu() in userspace for arm]
On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote:
> On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <[email protected]
> > wrote:
> > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote:
> >> On 05/10/16 17:39, Fredrik Markström wrote:
> >> > The approach I suggested below with the vDSO data page will obviously
> >> > not work on smp, so suggestions are welcome.
> >>
> >> Well, given that it's user-writeable, is there any reason an application
> >> which cares couldn't simply run some per-cpu threads to call getcpu()
> >> once and cache the result in TPIDRURW themselves? That would appear to
> >> both raise no compatibility issues and work with existing kernels.
> >
> > There is - the contents of TPIDRURW is thread specific, and it moves
> > with the thread between CPU cores. So, if a thread was running on CPU0
> > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1,
> > TPIDRURW would still contain 0.
> >
> > I'm also not in favour of changing the TPIDRURW usage to be a storage
> > repository for the CPU number - it's far too specific a usage and seems
> > like a waste of hardware resources to solve one problem.
>
> Ok, but right now it's nothing but a (architecture specific) piece of TLS,
> which we have generic mechanisms for. From my point of view that is a waste of
> hardware resources.
>
> > As Mark says, it's an ABI breaking change too, even if it is under a config
> option.
>
> I can't argue with that. If it's an ABI it's an ABI, even if I can't imagine
> why anyone would use it over normal tls... but then again, people probably do.
>
> So in conclusion I agree and give up.
Rather than give up, you could take a look at the patches from Mathieu
Desnoyers, that tackle this in a very different way. It's also the reason
we've been holding off implementing an optimised getcpu in the arm64 vdso,
because it might all well be replaced by the new restartable sequences
approach:
http://lkml.kernel.org/r/[email protected]
He's also got support for arch/arm/ in that series, so you could take
them for a spin. The main thing missing at the moment is justification
for the feature using real-world code, as requested by Linus:
http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g@mail.gmail.com
so if your per-cpu buffer use-case is compelling in its own right (as
opposed to a micro-benchmark), then you could chime in over there.
Will
----- On Oct 10, 2016, at 5:29 PM, Will Deacon [email protected] wrote:
> Hi Fredrik,
>
> [adding Mathieu -- background is getcpu() in userspace for arm]
>
> On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote:
>> On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <[email protected]
>> > wrote:
>> > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote:
>> >> On 05/10/16 17:39, Fredrik Markström wrote:
>> >> > The approach I suggested below with the vDSO data page will obviously
>> >> > not work on smp, so suggestions are welcome.
>> >>
>> >> Well, given that it's user-writeable, is there any reason an application
>> >> which cares couldn't simply run some per-cpu threads to call getcpu()
>> >> once and cache the result in TPIDRURW themselves? That would appear to
>> >> both raise no compatibility issues and work with existing kernels.
>> >
>> > There is - the contents of TPIDRURW is thread specific, and it moves
>> > with the thread between CPU cores. So, if a thread was running on CPU0
>> > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1,
>> > TPIDRURW would still contain 0.
>> >
>> > I'm also not in favour of changing the TPIDRURW usage to be a storage
>> > repository for the CPU number - it's far too specific a usage and seems
>> > like a waste of hardware resources to solve one problem.
>>
>> Ok, but right now it's nothing but a (architecture specific) piece of TLS,
>> which we have generic mechanisms for. From my point of view that is a waste of
>> hardware resources.
>>
>> > As Mark says, it's an ABI breaking change too, even if it is under a config
>> option.
>>
>> I can't argue with that. If it's an ABI it's an ABI, even if I can't imagine
>> why anyone would use it over normal tls... but then again, people probably do.
>>
>> So in conclusion I agree and give up.
>
> Rather than give up, you could take a look at the patches from Mathieu
> Desnoyers, that tackle this in a very different way. It's also the reason
> we've been holding off implementing an optimised getcpu in the arm64 vdso,
> because it may well all be replaced by the new restartable sequences
> approach:
>
> http://lkml.kernel.org/r/[email protected]
>
> He's also got support for arch/arm/ in that series, so you could take
> them for a spin. The main thing missing at the moment is justification
> for the feature using real-world code, as requested by Linus:
>
> http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g@mail.gmail.com
>
> so if your per-cpu buffer use-case is compelling in its own right (as
> opposed to a micro-benchmark), then you could chime in over there.
>
> Will
FYI, I've adapted lttng-ust ring buffer (as a POC) to rseq in a dev
branch. I see interesting speedups. See top 3-4 commits of
https://github.com/compudj/lttng-ust-dev/tree/rseq-integration
(start with "Use rseq for...").
On x86-64, we have a 7ns speedup over sched_getcpu, and a 30ns speedup
by using rseq atomics, which brings the cost per event record down to
about 100ns/event (a 37% speedup). This replaces 3 atomic operations
on the fast path.
On arm32, the cpu_id acceleration gives a 327 ns/event speed increase,
which brings the cost down to 2000ns/event. Note that reading time on
that system does not use the vDSO (old glibc), so it implies a system
call. This accounts for 857ns/event. I don't observe a speedup or a
slowdown from using rseq instead of ll/sc atomic operations on that
specific board (a Cubietruck, which only has 2 cores). I suspect that
boards with more cores will benefit more from replacing ll/sc with
rseq atomics. If we don't account for the overhead of reading time
through a system call, we get a 22% speedup.
I have extra benchmarks in this branch:
https://github.com/compudj/rseq-test
Updated ref for current rseq-enabled kernel based on 4.8:
https://github.com/compudj/linux-percpu-dev/tree/rseq-fallback
(ARM64 port would be welcome!) :)
As Will pointed out, what I'm currently looking for is real-life
benchmarks that show the benefits of rseq. I fear that the
microbenchmarks I have for the lttng-ust tracer may be dismissed as
being too specific.
Most heavy users of LTTng-UST are closed source applications, so it's
not easy for me to provide numbers in real-life scenarios.
The major use-case besides per-cpu buffering/tracing AFAIU is memory
allocators. It will mainly benefit in use-cases where there are more
threads than cores in a multithreaded application. This mainly makes
sense if threads are either dedicated to specific tasks, and therefore
are often idle, or in use-cases where worker threads are expected to
block (else, if threads are not expected to block, the application
should simply have one thread per core).
Dave Watson had interesting RSS shrinkage on this stress-test program:
http://locklessinc.com/downloads/t-test1.c modified to have 500 threads.
It uses jemalloc modified to use rseq.
I reproduced it on my laptop with 100 threads, 50000 loops:
4-core, 100 threads, 50000 loops.
malloc:
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
10136 compudj 20 0 2857840 24756 1348 R 379.4 0.4 3:49.50 t-test1
real 3m20.830s
user 3m22.164s
sys 9m40.936s
upstream jemalloc:
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
21234 compudj 20 0 2227124 49300 2280 S 306.2 0.7 2:26.97 t-test1
real 1m3.297s
user 3m19.616s
sys 0m8.500s
rseq jemalloc 4.2.1:
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
25652 compudj 20 0 877956 35624 3260 S 301.2 0.5 1:26.07 t-test1
real 0m27.639s
user 1m18.172s
sys 0m1.752s
The next step to translate this into a "real-life" number would be to
run rseq-jemalloc on a Facebook node, but Dave has been on vacation for
the past few weeks. Perhaps someone else at Facebook or Google could
look into this?
Cheers,
Mathieu
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com