Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 23.128.96.18 as permitted sender) client-ip=23.128.96.18;
Date:   Mon, 20 Jul 2020 21:04:45 -0700 (PDT)
Subject:     Re: [PATCH] riscv: Select ARCH_HAS_DEBUG_VM_PGTABLE
In-Reply-To: <0d7d0c38-5b67-1793-47d7-b8a7714838ee@arm.com>
CC:     kernel@esmil.dk, linux-riscv@lists.infradead.org,
        Paul Walmsley <paul.walmsley@sifive.com>,
        linux-kernel@vger.kernel.org
From:   Palmer Dabbelt <palmer@dabbelt.com>
To:     anshuman.khandual@arm.com
Message-ID: <mhng-2f6b1241-9ab3-47a2-bc23-789d59a3fd8f@palmerdabbelt-glaptop1>
Mime-Version: 1.0 (MHng)
Content-Type: text/plain; charset=utf-8; format=flowed
Content-Transfer-Encoding: 8bit
Sender: linux-kernel-owner@vger.kernel.org
Precedence: bulk

On Tue, 14 Jul 2020 20:20:54 PDT (-0700), anshuman.khandual@arm.com wrote:
>
>
> On 07/15/2020 02:56 AM, Emil Renner Berthing wrote:
>> This allows the pgtable tests to be built.
>>
>> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
>> ---
>>
>> The tests seem to succeed both in Qemu and on the HiFive Unleashed
>>
>> Both with and without the recent additions in
>> https://lore.kernel.org/linux-riscv/1594610587-4172-1-git-send-email-anshuman.khandual@arm.com/
>
> That's great, thanks for testing.

Actually, looking at this I'm not sure it actually helps us any.  This changes
the behavior of two functions.  Pulling out the relevant sections, I see:

unsigned int __sw_hweight32(unsigned int w)
{
#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
        w -= (w >> 1) & 0x55555555;
        w =  (w & 0x33333333) + ((w >> 2) & 0x33333333);
        w =  (w + (w >> 4)) & 0x0f0f0f0f;
        return (w * 0x01010101) >> 24;
#else
        unsigned int res = w - ((w >> 1) & 0x55555555);
        res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
        res = (res + (res >> 4)) & 0x0F0F0F0F;
        res = res + (res >> 8);
        return (res + (res >> 16)) & 0x000000FF;
#endif
}

and

unsigned long memchr_inv(unsigned long value64)
{
#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
        value64 *= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
        value64 *= 0x01010101;
        value64 |= value64 << 32;
#else
        value64 |= value64 << 8;
        value64 |= value64 << 16;
        value64 |= value64 << 32;
#endif
	return value64;
}

GCC optimizer the multiplication out of the first one:

__sw_hweight32:
	li	a4,1431654400
	srliw	a5,a0,1
	addi	a4,a4,1365
	and	a5,a5,a4
	subw	a0,a0,a5
	li	a5,858992640
	srliw	a4,a0,2
	addi	a5,a5,819
	and	a0,a5,a0
	and	a5,a5,a4
	addw	a5,a0,a5
	srliw	a0,a5,4
	addw	a0,a0,a5
	li	a5,252645376
	addi	a5,a5,-241
	and	a5,a5,a0
	srliw	a0,a5,8
	addw	a5,a0,a5
	srliw	a0,a5,16
	addw	a0,a0,a5
	andi	a0,a0,0xff
	ret

__sw_hweight32:
	li	a5,1431654400
	srliw	a4,a0,1
	addi	a5,a5,1365
	and	a5,a5,a4
	subw	a0,a0,a5
	li	a5,858992640
	srliw	a4,a0,2
	addi	a5,a5,819
	and	a0,a5,a0
	and	a5,a5,a4
	addw	a5,a0,a5
	srliw	a0,a5,4
	addw	a5,a0,a5
	li	a0,252645376
	addi	a0,a0,-241
	and	a5,a0,a5
	slliw	a0,a5,8
	addw	a0,a0,a5
	slliw	a5,a0,16
	addw	a0,a0,a5
	srliw	a0,a0,24
	ret

so that doesn't matter.  The second one is really a wash: 

memchr_inv:
	ld	a5,.LC0
	mul	a0,a0,a5
	ret
.rodata
.LC0:
	.dword	72340172838076673

vs

memchr_inv:
	slli	a5,a0,8
	or	a5,a5,a0
	slli	a0,a5,16
	or	a0,a0,a5
	slli	a5,a0,32
	or	a0,a5,a0
	ret

It's unlikely that load ends up relaxed, so it's going to be two instructions.
That means we have 4 cycles to forward the load and multiply, for a cache hit.
IIRC the multiplier on the existing hardware isn't that fast -- GCC lists it as
imul as 10 cycles, but I remember it being more like 5 so that might just be an
architecture-inaccurate tuning in the generic pipeline model.  This is out of
the inner loop, so it's probably not all that important anyway.  The result
isn't used for a while so on a bigger machine it's probably worth picking the
smaller code path, but it seems like a very small thing to optimize for either
way.

I'm actually a bit surprised about this.  Do you have benchmarks that indicate
ARCH_HAS_FAST_MULTIPLIER is actually faster?  Otherwise I guess I'm going to
reject this, as it's really more
ARCH_HAS_FAST_MULTIPLIER_AND_FAST_LARGE_CONSTANTS than just
ARCH_HAS_FAST_MULTIPLIER.