Let's rework the __arch_xprod_64() code to avoid large immediate values
and to convert some 64-bit variables to 32-bit ones where possible. This
allows gcc to produce smaller and better code, and even optimal code on
RISC-V.
Signed-off-by: Nicolas Pitre <[email protected]>
diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index dc9726fdac..33358245b4 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -178,7 +178,8 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
uint32_t m_hi = m >> 32;
uint32_t n_lo = n;
uint32_t n_hi = n >> 32;
- uint64_t res, tmp;
+ uint64_t res;
+ uint32_t res_lo, res_hi, tmp;
if (!bias) {
res = ((uint64_t)m_lo * n_lo) >> 32;
@@ -187,8 +188,9 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
res = (m + (uint64_t)m_lo * n_lo) >> 32;
} else {
res = m + (uint64_t)m_lo * n_lo;
- tmp = (res < m) ? (1ULL << 32) : 0;
- res = (res >> 32) + tmp;
+ res_lo = res >> 32;
+ res_hi = (res_lo < m_hi);
+ res = res_lo | ((uint64_t)res_hi << 32);
}
if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
@@ -197,10 +199,12 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
res += (uint64_t)m_hi * n_lo;
res >>= 32;
} else {
- tmp = res += (uint64_t)m_lo * n_hi;
+ res += (uint64_t)m_lo * n_hi;
+ tmp = res >> 32;
res += (uint64_t)m_hi * n_lo;
- tmp = (res < tmp) ? (1ULL << 32) : 0;
- res = (res >> 32) + tmp;
+ res_lo = res >> 32;
+ res_hi = (res_lo < tmp);
+ res = res_lo | ((uint64_t)res_hi << 32);
}
res += (uint64_t)m_hi * n_hi;
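
For anyone who wants to convince themselves that the comparison-based
carry detection above is equivalent to the old
(res < m) ? (1ULL << 32) : 0 form, here is a small standalone sketch.
It is not part of the patch: the xprod_hi64() name, the unconditional
use of the carry-tracking path (the fast path keyed on bits 31/63 of m
is omitted), and the unsigned __int128 cross-check (a gcc/clang
extension) are illustrative only.

/*
 * Standalone sketch of the carry trick used in the patch above: a carry
 * out of a 64-bit addition is detected by checking whether the upper 32
 * bits of the (possibly wrapped) sum dropped below the upper 32 bits
 * known before the addition, instead of conditionally adding a
 * 1ULL << 32 immediate.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Computes ((bias ? m : 0) + (__int128)m * n) >> 64, the same semantics
 * as __arch_xprod_64(). */
static uint64_t xprod_hi64(uint64_t m, uint64_t n, int bias)
{
	uint32_t m_lo = m, m_hi = m >> 32;
	uint32_t n_lo = n, n_hi = n >> 32;
	uint32_t res_lo, res_hi, tmp;
	uint64_t res;

	if (!bias) {
		res = ((uint64_t)m_lo * n_lo) >> 32;
	} else {
		res = m + (uint64_t)m_lo * n_lo;
		res_lo = res >> 32;
		/* the addition wrapped iff the high word fell below m_hi */
		res_hi = (res_lo < m_hi);
		res = res_lo | ((uint64_t)res_hi << 32);
	}

	res += (uint64_t)m_lo * n_hi;	/* cannot wrap */
	tmp = res >> 32;
	res += (uint64_t)m_hi * n_lo;	/* may wrap */
	res_lo = res >> 32;
	/* same trick: the high word can only shrink if the addition wrapped */
	res_hi = (res_lo < tmp);
	res = res_lo | ((uint64_t)res_hi << 32);

	return res + (uint64_t)m_hi * n_hi;
}

int main(void)
{
	static const uint64_t v[] = {
		0, 1, 0xffffffff, 0x100000000ULL,
		0x7fffffffffffffffULL, 0xdeadbeefcafef00dULL, ~0ULL,
	};
	unsigned int i, j, b;

	for (i = 0; i < sizeof(v) / sizeof(v[0]); i++)
		for (j = 0; j < sizeof(v) / sizeof(v[0]); j++)
			for (b = 0; b <= 1; b++) {
				unsigned __int128 ref = (unsigned __int128)v[i] * v[j];
				if (b)
					ref += v[i];
				assert(xprod_hi64(v[i], v[j], b) == (uint64_t)(ref >> 64));
			}
	printf("carry trick checks out\n");
	return 0;
}

The point of the comparison form is that the carry test becomes a plain
32-bit compare (e.g. a single sltu on RISC-V) rather than a conditional
add of a 64-bit constant.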
Ping.
On Tue, 20 Aug 2019, Nicolas Pitre wrote:
> Let's rework the __arch_xprod_64() code to avoid large immediate values
> and to convert some 64-bit variables to 32-bit ones where possible. This
> allows gcc to produce smaller and better code, and even optimal code on
> RISC-V.
>
> Signed-off-by: Nicolas Pitre <[email protected]>
>
> diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
> index dc9726fdac..33358245b4 100644
> --- a/include/asm-generic/div64.h
> +++ b/include/asm-generic/div64.h
> @@ -178,7 +178,8 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
> uint32_t m_hi = m >> 32;
> uint32_t n_lo = n;
> uint32_t n_hi = n >> 32;
> - uint64_t res, tmp;
> + uint64_t res;
> + uint32_t res_lo, res_hi, tmp;
>
> if (!bias) {
> res = ((uint64_t)m_lo * n_lo) >> 32;
> @@ -187,8 +188,9 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
> res = (m + (uint64_t)m_lo * n_lo) >> 32;
> } else {
> res = m + (uint64_t)m_lo * n_lo;
> - tmp = (res < m) ? (1ULL << 32) : 0;
> - res = (res >> 32) + tmp;
> + res_lo = res >> 32;
> + res_hi = (res_lo < m_hi);
> + res = res_lo | ((uint64_t)res_hi << 32);
> }
>
> if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
> @@ -197,10 +199,12 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
> res += (uint64_t)m_hi * n_lo;
> res >>= 32;
> } else {
> - tmp = res += (uint64_t)m_lo * n_hi;
> + res += (uint64_t)m_lo * n_hi;
> + tmp = res >> 32;
> res += (uint64_t)m_hi * n_lo;
> - tmp = (res < tmp) ? (1ULL << 32) : 0;
> - res = (res >> 32) + tmp;
> + res_lo = res >> 32;
> + res_hi = (res_lo < tmp);
> + res = res_lo | ((uint64_t)res_hi << 32);
> }
>
> res += (uint64_t)m_hi * n_hi;
>
On Wed, Aug 21, 2019 at 5:05 AM Nicolas Pitre <[email protected]> wrote:
>
> Let's rework the __arch_xprod_64() code to avoid large immediate values
> and to convert some 64-bit variables to 32-bit ones where possible. This
> allows gcc to produce smaller and better code, and even optimal code on
> RISC-V.
>
> Signed-off-by: Nicolas Pitre <[email protected]>
>
Applied, thanks!
Arnd