This allows the pgtable tests to be built.

Signed-off-by: Emil Renner Berthing <[email protected]>
---

The tests seem to succeed both in Qemu and on the HiFive Unleashed

Both with and without the recent additions in
https://lore.kernel.org/linux-riscv/[email protected]/

 Documentation/features/debug/debug-vm-pgtable/arch-support.txt | 2 +-
 arch/riscv/Kconfig                                             | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
index c527d05c0459..c9a764c3c795 100644
--- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
+++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
@@ -23,7 +23,7 @@
     |       openrisc: | TODO |
     |         parisc: | TODO |
     |        powerpc: |  ok  |
-    |          riscv: | TODO |
+    |          riscv: |  ok  |
     |           s390: |  ok  |
     |             sh: | TODO |
     |          sparc: | TODO |
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 3230c1d48562..b4e674b1e857 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -15,6 +15,7 @@ config RISCV
 	select ARCH_CLOCKSOURCE_INIT
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
+	select ARCH_HAS_DEBUG_VM_PGTABLE
 	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GIGANTIC_PAGE
--
2.27.0
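
[A note for anyone wanting to actually run the tests after this: the select
above only makes the test infrastructure available on riscv. If I read
mm/debug_vm_pgtable.c right, the tests themselves still need to be enabled
with CONFIG_DEBUG_VM_PGTABLE=y, and they then run once during boot, reporting
any failures via the kernel log.]
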
On 07/15/2020 02:56 AM, Emil Renner Berthing wrote:
> This allows the pgtable tests to be built.
>
> Signed-off-by: Emil Renner Berthing <[email protected]>
> ---
>
> The tests seem to succeed both in Qemu and on the HiFive Unleashed
>
> Both with and without the recent additions in
> https://lore.kernel.org/linux-riscv/[email protected]/

That's great, thanks for testing.

On Tue, 14 Jul 2020 20:20:54 PDT (-0700), [email protected] wrote:
>
>
> On 07/15/2020 02:56 AM, Emil Renner Berthing wrote:
>> This allows the pgtable tests to be built.
>>
>> Signed-off-by: Emil Renner Berthing <[email protected]>
>> ---
>>
>> The tests seem to succeed both in Qemu and on the HiFive Unleashed
>>
>> Both with and without the recent additions in
>> https://lore.kernel.org/linux-riscv/[email protected]/
>
> That's great, thanks for testing.

Actually, looking at this I'm not sure it actually helps us any. This changes
the behavior of two functions. Pulling out the relevant sections, I see:

unsigned int __sw_hweight32(unsigned int w)
{
#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
	w -= (w >> 1) & 0x55555555;
	w =  (w & 0x33333333) + ((w >> 2) & 0x33333333);
	w =  (w + (w >> 4)) & 0x0f0f0f0f;
	return (w * 0x01010101) >> 24;
#else
	unsigned int res = w - ((w >> 1) & 0x55555555);
	res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
	res = (res + (res >> 4)) & 0x0F0F0F0F;
	res = res + (res >> 8);
	return (res + (res >> 16)) & 0x000000FF;
#endif
}

and

unsigned long memchr_inv(unsigned long value64)
{
#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
	value64 *= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
	value64 *= 0x01010101;
	value64 |= value64 << 32;
#else
	value64 |= value64 << 8;
	value64 |= value64 << 16;
	value64 |= value64 << 32;
#endif
	return value64;
}
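
[For what it's worth, here is a small standalone harness one could use to
sanity-check what the two #ifdef arms must agree on. This is an editorial
sketch, not from the thread; hweight32_ref and the splat64_* helpers are
made-up names wrapping the bodies excerpted above:

#include <assert.h>
#include <stdint.h>

/* The #else arm of __sw_hweight32() above, copied verbatim. */
static unsigned int hweight32_ref(unsigned int w)
{
	unsigned int res = w - ((w >> 1) & 0x55555555);
	res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
	res = (res + (res >> 4)) & 0x0F0F0F0F;
	res = res + (res >> 8);
	return (res + (res >> 16)) & 0x000000FF;
}

/* The two byte-splat strategies from the memchr_inv() excerpt. */
static uint64_t splat64_mul(uint64_t v)
{
	return v * 0x0101010101010101ULL;
}

static uint64_t splat64_shift(uint64_t v)
{
	v |= v << 8;
	v |= v << 16;
	v |= v << 32;
	return v;
}

int main(void)
{
	/* Popcounts checked by hand. */
	assert(hweight32_ref(0x00000000) == 0);
	assert(hweight32_ref(0x80000001) == 2);
	assert(hweight32_ref(0xdeadbeef) == 24);
	assert(hweight32_ref(0xffffffff) == 32);

	/* memchr_inv() only ever splats a single byte, so carries
	 * never cross byte lanes and the two forms must match. */
	for (uint64_t v = 0; v < 256; v++)
		assert(splat64_mul(v) == splat64_shift(v));
	return 0;
}

Compiling something like this with a riscv64 cross gcc at -O2 with -S is
presumably also how the listings below were produced.]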

GCC optimizes the multiplication out of the first one:

__sw_hweight32:
li a4,1431654400
srliw a5,a0,1
addi a4,a4,1365
and a5,a5,a4
subw a0,a0,a5
li a5,858992640
srliw a4,a0,2
addi a5,a5,819
and a0,a5,a0
and a5,a5,a4
addw a5,a0,a5
srliw a0,a5,4
addw a0,a0,a5
li a5,252645376
addi a5,a5,-241
and a5,a5,a0
srliw a0,a5,8
addw a5,a0,a5
srliw a0,a5,16
addw a0,a0,a5
andi a0,a0,0xff
ret

__sw_hweight32:
li a5,1431654400
srliw a4,a0,1
addi a5,a5,1365
and a5,a5,a4
subw a0,a0,a5
li a5,858992640
srliw a4,a0,2
addi a5,a5,819
and a0,a5,a0
and a5,a5,a4
addw a5,a0,a5
srliw a0,a5,4
addw a5,a0,a5
li a0,252645376
addi a0,a0,-241
and a5,a0,a5
slliw a0,a5,8
addw a0,a0,a5
slliw a5,a0,16
addw a0,a0,a5
srliw a0,a0,24
ret
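
[The first listing above is the #else arm; in the second, GCC has turned the
* 0x01010101 into the closing slliw/addw/srliw sequence, so the multiply is
gone either way.]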

so that doesn't matter. The second one is really a wash:

memchr_inv:
ld a5,.LC0
mul a0,a0,a5
ret
.rodata
.LC0:
.dword 72340172838076673

vs

memchr_inv:
slli a5,a0,8
or a5,a5,a0
slli a0,a5,16
or a0,a0,a5
slli a5,a0,32
or a0,a5,a0
ret
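
[For reference, that .dword constant, 72340172838076673, is just
0x0101010101010101, i.e. the byte-splat multiplier from the C above.]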

It's unlikely that load ends up relaxed, so it's going to be two instructions.
That means we have 4 cycles to forward the load and multiply, for a cache hit.
IIRC the multiplier on the existing hardware isn't that fast -- GCC lists imul
as 10 cycles, but I remember it being more like 5, so that might just be an
architecture-inaccurate tuning in the generic pipeline model. This is out of
the inner loop, so it's probably not all that important anyway. The result
isn't used for a while, so on a bigger machine it's probably worth picking the
smaller code path, but it seems like a very small thing to optimize for either
way.

I'm actually a bit surprised about this. Do you have benchmarks that indicate
ARCH_HAS_FAST_MULTIPLIER is actually faster? Otherwise I guess I'm going to
reject this, as it's really more
ARCH_HAS_FAST_MULTIPLIER_AND_FAST_LARGE_CONSTANTS than just
ARCH_HAS_FAST_MULTIPLIER.

On Tue, 21 Jul 2020 at 06:04, Palmer Dabbelt <[email protected]> wrote:
>
> [...]
>
> I'm actually a bit surprised about this. Do you have benchmarks that indicate
> ARCH_HAS_FAST_MULTIPLIER is actually faster? Otherwise I guess I'm going to
> reject this, as it's really more
> ARCH_HAS_FAST_MULTIPLIER_AND_FAST_LARGE_CONSTANTS than just
> ARCH_HAS_FAST_MULTIPLIER.

Hi Palmer,

I think you meant this reply for
https://lore.kernel.org/linux-riscv/[email protected]/T/#t

/Emil

On Tue, 14 Jul 2020 14:26:11 PDT (-0700), [email protected] wrote:
> This allows the pgtable tests to be built.
>
> Signed-off-by: Emil Renner Berthing <[email protected]>
> ---
>
> The tests seem to succeed both in Qemu and on the HiFive Unleashed
>
> Both with and without the recent additions in
> https://lore.kernel.org/linux-riscv/[email protected]/
>
> [...]

This is on for-next. Thanks!