div64.patch :
Get rid of the 64-bit constant divide by one. This appears to be a common case
when HZ == USER_HZ. On older 2.6 kernels the patch produced some harmless
warnings (statement has no effect), but it builds clean for i386 with a 2.6.10
kernel. I tested the generic asm inline by extracting it into a standalone gcc
test case, but I have not tested any other platforms.
Doubtful, but if this breaks your build for some other platform, send me mail.
Zach Amsden
[email protected]
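For context, here is the kind of call site that motivates the patch -- a sketch
in the style of kernel/time.c's jiffies_64_to_clock_t(); the helper name is
made up, and it assumes HZ is an exact multiple of USER_HZ:

#include <linux/param.h>	/* HZ, USER_HZ */
#include <asm/div64.h>

/* Sketch only: convert a 64-bit jiffies count to USER_HZ ticks.
 * When HZ == USER_HZ the divisor below is the constant 1, which is
 * exactly the case the patch short-circuits. */
static inline unsigned long long jiffies_to_user_ticks(unsigned long long j)
{
	do_div(j, HZ / USER_HZ);
	return j;
}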
On Wed, 27 Oct 2004, Zachary Amsden wrote:
>
> I noticed several 64-bit divides for HZ/USER_HZ, and also the fact that
> HZ == USER_HZ on many architectures (or if you play with scaling it ;).
> Since do_div is macroized to optimized assembler on many platforms, we
> emit long divides for divide by one. This could be extended further to
> recognize other power of two divides, but I don't think the complexity
> of the macros would be justified. I also didn't feel it was worthwhile
> to optimize this for non-constant divides; if you feel otherwise, please
> extend.
Can you test out the trivial extension, which is to allow any power-of-two
constant, and just leave it as a divide in those cases (and let the
compiler turn it into a 64-bit shift)?
It's very easy to test for powers of two: !((x) & ((x)-1)) does it for
everything but zero, and zero in this case is an error anyway.
Linus
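As a standalone userspace check of that expression (scaffolding only, not part
of the patch):

#include <stdio.h>

/* The test Linus describes: true for every power of two, and also for
 * zero, which is a caller error here anyway. */
#define IS_POW2(x) (!((x) & ((x) - 1)))

int main(void)
{
	unsigned int v[] = { 1, 2, 3, 4, 6, 64, 100, 0x80000000u };
	unsigned int i;

	for (i = 0; i < sizeof(v) / sizeof(v[0]); i++)
		printf("%#10x -> %d\n", v[i], IS_POW2(v[i]));
	return 0;
}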
diff -ru linux-2.6.10-rc1/include/asm-arm/div64.h linux-2.6.10-rc1-nsz/include/asm-arm/div64.h
--- linux-2.6.10-rc1/include/asm-arm/div64.h 2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-arm/div64.h 2004-10-27 10:24:25.000000000 -0700
@@ -27,22 +27,27 @@
#define __xh "r1"
#endif
-#define do_div(n,base) \
-({ \
- register unsigned int __base asm("r4") = base; \
- register unsigned long long __n asm("r0") = n; \
- register unsigned long long __res asm("r2"); \
- register unsigned int __rem asm(__xh); \
- asm( __asmeq("%0", __xh) \
- __asmeq("%1", "r2") \
- __asmeq("%2", "r0") \
- __asmeq("%3", "r4") \
- "bl __do_div64" \
- : "=r" (__rem), "=r" (__res) \
- : "r" (__n), "r" (__base) \
- : "ip", "lr", "cc"); \
- n = __res; \
- __rem; \
+#define do_div(n,base) \
+({ \
+ register unsigned long long __n asm("r0") = n; \
+ register unsigned long long __res asm("r2"); \
+ register unsigned int __rem asm(__xh); \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __rem = __n % (base); \
+ __res = __n / (base); \
+ } else { \
+ register unsigned int __base asm("r4") = base; \
+ asm( __asmeq("%0", __xh) \
+ __asmeq("%1", "r2") \
+ __asmeq("%2", "r0") \
+ __asmeq("%3", "r4") \
+ "bl __do_div64" \
+ : "=r" (__rem), "=r" (__res) \
+ : "r" (__n), "r" (__base) \
+ : "ip", "lr", "cc"); \
+ } \
+ n = __res; \
+ __rem; \
})
#endif
diff -ru linux-2.6.10-rc1/include/asm-generic/div64.h linux-2.6.10-rc1-nsz/include/asm-generic/div64.h
--- linux-2.6.10-rc1/include/asm-generic/div64.h 2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-generic/div64.h 2004-10-27 10:24:06.000000000 -0700
@@ -22,12 +22,17 @@
#if BITS_PER_LONG == 64
-# define do_div(n,base) ({ \
- uint32_t __base = (base); \
- uint32_t __rem; \
- __rem = ((uint64_t)(n)) % __base; \
- (n) = ((uint64_t)(n)) / __base; \
- __rem; \
+# define do_div(n,base) ({ \
+ uint32_t __rem; \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __rem = ((uint64_t)(n)) % (base); \
+ (n) = ((uint64_t)(n)) / (base); \
+ } else { \
+ uint32_t __base = (base); \
+ __rem = ((uint64_t)(n)) % __base; \
+ (n) = ((uint64_t)(n)) / __base; \
+ } \
+ __rem; \
})
#elif BITS_PER_LONG == 32
@@ -37,16 +42,21 @@
/* The unnecessary pointer compare is there
* to check for type safety (n must be 64bit)
*/
-# define do_div(n,base) ({ \
- uint32_t __base = (base); \
- uint32_t __rem; \
- (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \
- if (likely(((n) >> 32) == 0)) { \
- __rem = (uint32_t)(n) % __base; \
- (n) = (uint32_t)(n) / __base; \
- } else \
- __rem = __div64_32(&(n), __base); \
- __rem; \
+# define do_div(n,base) ({ \
+ uint32_t __rem; \
+ (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __rem = ((uint64_t)(n)) % (base); \
+ (n) = ((uint64_t)(n)) / (base); \
+ } else { \
+ uint32_t __base = (base); \
+ if (likely(((n) >> 32) == 0)) { \
+ __rem = (uint32_t)(n) % __base; \
+ (n) = (uint32_t)(n) / __base; \
+ } else \
+ __rem = __div64_32(&(n), __base); \
+ } \
+ __rem; \
})
#else /* BITS_PER_LONG == ?? */
diff -ru linux-2.6.10-rc1/include/asm-i386/div64.h linux-2.6.10-rc1-nsz/include/asm-i386/div64.h
--- linux-2.6.10-rc1/include/asm-i386/div64.h 2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-i386/div64.h 2004-10-27 10:15:04.000000000 -0700
@@ -14,16 +14,23 @@
* convention" on x86.
*/
#define do_div(n,base) ({ \
- unsigned long __upper, __low, __high, __mod, __base; \
- __base = (base); \
- asm("":"=a" (__low), "=d" (__high):"A" (n)); \
- __upper = __high; \
- if (__high) { \
- __upper = __high % (__base); \
- __high = __high / (__base); \
+ unsigned long __mod; \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __mod = ((uint64_t)(n)) % (base); \
+ (n) = ((uint64_t)(n)) / (base); \
+ } else { \
+ unsigned long __upper, __low, __high, __base; \
+ __base = (base); \
+ asm("":"=a" (__low), "=d" (__high):"A" (n)); \
+ __upper = __high; \
+ if (__high) { \
+ __upper = __high % (__base); \
+ __high = __high / (__base); \
+ } \
+ asm("divl %2": "=a" (__low), "=d" (__mod) : \
+ "rm" (__base), "0" (__low), "1" (__upper)); \
+ asm("":"=A" (n):"a" (__low),"d" (__high)); \
} \
- asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
- asm("":"=A" (n):"a" (__low),"d" (__high)); \
__mod; \
})
diff -ru linux-2.6.10-rc1/include/asm-m32r/div64.h linux-2.6.10-rc1-nsz/include/asm-m32r/div64.h
--- linux-2.6.10-rc1/include/asm-m32r/div64.h 2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-m32r/div64.h 2004-10-27 10:23:37.000000000 -0700
@@ -11,28 +11,33 @@
* n = n / base;
* return value = n % base;
*/
-#define do_div(n, base) \
-({ \
- unsigned long _res, _high, _mid, _low; \
- \
- _low = (n) & 0xffffffffUL; \
- _high = (n) >> 32; \
- if (_high) { \
- _mid = (_high % (unsigned long)(base)) << 16; \
- _high = _high / (unsigned long)(base); \
- _mid += _low >> 16; \
- _low &= 0x0000ffffUL; \
- _low += (_mid % (unsigned long)(base)) << 16; \
- _mid = _mid / (unsigned long)(base); \
- _res = _low % (unsigned long)(base); \
- _low = _low / (unsigned long)(base); \
- n = _low + ((long long)_mid << 16) + \
- ((long long)_high << 32); \
- } else { \
- _res = _low % (unsigned long)(base); \
- n = (_low / (unsigned long)(base)); \
- } \
- _res; \
+#define do_div(n, base) \
+({ \
+ unsigned long _res; \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ _res = ((uint64_t)(n)) % (base); \
+ (n) = ((uint64_t)(n)) / (base); \
+ } else { \
+ unsigned long _high, _mid, _low; \
+ _low = (n) & 0xffffffffUL; \
+ _high = (n) >> 32; \
+ if (_high) { \
+ _mid = (_high % (unsigned long)(base)) << 16; \
+ _high = _high / (unsigned long)(base); \
+ _mid += _low >> 16; \
+ _low &= 0x0000ffffUL; \
+ _low += (_mid % (unsigned long)(base)) << 16; \
+ _mid = _mid / (unsigned long)(base); \
+ _res = _low % (unsigned long)(base); \
+ _low = _low / (unsigned long)(base); \
+ n = _low + ((long long)_mid << 16) + \
+ ((long long)_high << 32); \
+ } else { \
+ _res = _low % (unsigned long)(base); \
+ n = (_low / (unsigned long)(base)); \
+ } \
+ } \
+ _res; \
})
#endif /* _ASM_M32R_DIV64 */
diff -ru linux-2.6.10-rc1/include/asm-m68k/div64.h linux-2.6.10-rc1-nsz/include/asm-m68k/div64.h
--- linux-2.6.10-rc1/include/asm-m68k/div64.h 2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-m68k/div64.h 2004-10-27 10:23:23.000000000 -0700
@@ -3,24 +3,30 @@
/* n = n / base; return rem; */
-#define do_div(n, base) ({ \
- union { \
- unsigned long n32[2]; \
- unsigned long long n64; \
- } __n; \
- unsigned long __rem, __upper; \
- \
- __n.n64 = (n); \
- if ((__upper = __n.n32[0])) { \
- asm ("divul.l %2,%1:%0" \
- : "=d" (__n.n32[0]), "=d" (__upper) \
- : "d" (base), "0" (__n.n32[0])); \
- } \
- asm ("divu.l %2,%1:%0" \
- : "=d" (__n.n32[1]), "=d" (__rem) \
- : "d" (base), "1" (__upper), "0" (__n.n32[1])); \
- (n) = __n.n64; \
- __rem; \
-})
+#define do_div(n, base) ({ \
+ unsigned long __rem; \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __rem = ((uint64_t)(n)) % (base); \
+ (n) = ((uint64_t)(n)) / (base); \
+ } else { \
+ union { \
+ unsigned long n32[2]; \
+ unsigned long long n64; \
+ } __n; \
+ unsigned long __upper; \
+ \
+ __n.n64 = (n); \
+ if ((__upper = __n.n32[0])) { \
+ asm ("divul.l %2,%1:%0" \
+ : "=d" (__n.n32[0]), "=d" (__upper) \
+ : "d" (base), "0" (__n.n32[0])); \
+ } \
+ asm ("divu.l %2,%1:%0" \
+ : "=d" (__n.n32[1]), "=d" (__rem) \
+ : "d" (base), "1" (__upper), "0" (__n.n32[1])); \
+ (n) = __n.n64; \
+ } \
+ __rem; \
+})
#endif /* _M68K_DIV64_H */
diff -ru linux-2.6.10-rc1/include/asm-mips/div64.h linux-2.6.10-rc1-nsz/include/asm-mips/div64.h
--- linux-2.6.10-rc1/include/asm-mips/div64.h 2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-mips/div64.h 2004-10-27 10:34:01.000000000 -0700
@@ -52,27 +52,32 @@
__mod; })
#define do_div(n, base) ({ \
+ unsigned long long __div; \
unsigned long long __quot; \
unsigned long __mod; \
- unsigned long long __div; \
- unsigned long __upper, __low, __high, __base; \
\
__div = (n); \
- __base = (base); \
- \
- __high = __div >> 32; \
- __low = __div; \
- __upper = __high; \
- \
- if (__high) \
- __asm__("divu $0, %z2, %z3" \
- : "=h" (__upper), "=l" (__high) \
- : "Jr" (__high), "Jr" (__base)); \
- \
- __mod = do_div64_32(__low, __upper, __low, __base); \
- \
- __quot = __high; \
- __quot = __quot << 32 | __low; \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __mod = __div % (base); \
+ __quot = __div / (base); \
+ } else { \
+ unsigned long __upper, __low, __high, __base; \
+ \
+ __base = (base); \
+ __high = __div >> 32; \
+ __low = __div; \
+ __upper = __high; \
+ \
+ if (__high) \
+ __asm__("divu $0, %z2, %z3" \
+ : "=h" (__upper), "=l" (__high) \
+ : "Jr" (__high), "Jr" (__base)); \
+ \
+ __mod = do_div64_32(__low, __upper, __low, __base); \
+ \
+ __quot = __high; \
+ __quot = __quot << 32 | __low; \
+ } \
(n) = __quot; \
__mod; })
#endif /* (_MIPS_SZLONG == 32) */
@@ -104,18 +109,22 @@
* Hey, we're already 64-bit, no
* need to play games..
*/
-#define do_div(n, base) ({ \
+#define do_div(n, base) ({ \
+ unsigned long __div; \
unsigned long __quot; \
unsigned int __mod; \
- unsigned long __div; \
- unsigned int __base; \
\
__div = (n); \
- __base = (base); \
- \
- __mod = __div % __base; \
- __quot = __div / __base; \
- \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __mod = __div % (base); \
+ __quot = __div / (base); \
+ } else { \
+ unsigned int __base; \
+ \
+ __base = (base); \
+ __mod = __div % __base; \
+ __quot = __div / __base; \
+ } \
(n) = __quot; \
__mod; })
diff -ru linux-2.6.10-rc1/include/asm-s390/div64.h linux-2.6.10-rc1-nsz/include/asm-s390/div64.h
--- linux-2.6.10-rc1/include/asm-s390/div64.h 2004-10-27 10:42:32.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-s390/div64.h 2004-10-27 10:43:06.000000000 -0700
@@ -4,42 +4,47 @@
#ifndef __s390x__
/* for do_div "base" needs to be smaller than 2^31-1 */
-#define do_div(n, base) ({ \
- unsigned long long __n = (n); \
- unsigned long __r; \
- \
- asm (" slr 0,0\n" \
- " l 1,%1\n" \
- " srdl 0,1\n" \
- " dr 0,%2\n" \
- " alr 1,1\n" \
- " alr 0,0\n" \
- " lhi 2,1\n" \
- " n 2,%1\n" \
- " alr 0,2\n" \
- " clr 0,%2\n" \
- " jl 0f\n" \
- " slr 0,%2\n" \
- " ahi 1,1\n" \
- "0: st 1,%1\n" \
- " l 1,4+%1\n" \
- " srdl 0,1\n" \
- " dr 0,%2\n" \
- " alr 1,1\n" \
- " alr 0,0\n" \
- " lhi 2,1\n" \
- " n 2,4+%1\n" \
- " alr 0,2\n" \
- " clr 0,%2\n" \
- " jl 1f\n" \
- " slr 0,%2\n" \
- " ahi 1,1\n" \
- "1: st 1,4+%1\n" \
- " lr %0,0" \
- : "=d" (__r), "=m" (__n) \
- : "d" (base), "m" (__n) : "0", "1", "2", "cc" ); \
- (n) = (__n); \
- __r; \
+#define do_div(n, base) ({ \
+ unsigned long long __n = (n); \
+ unsigned long __r; \
+ \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __r = __n % (base); \
+ __n = __n / (base); \
+ } else { \
+ asm (" slr 0,0\n" \
+ " l 1,%1\n" \
+ " srdl 0,1\n" \
+ " dr 0,%2\n" \
+ " alr 1,1\n" \
+ " alr 0,0\n" \
+ " lhi 2,1\n" \
+ " n 2,%1\n" \
+ " alr 0,2\n" \
+ " clr 0,%2\n" \
+ " jl 0f\n" \
+ " slr 0,%2\n" \
+ " ahi 1,1\n" \
+ "0: st 1,%1\n" \
+ " l 1,4+%1\n" \
+ " srdl 0,1\n" \
+ " dr 0,%2\n" \
+ " alr 1,1\n" \
+ " alr 0,0\n" \
+ " lhi 2,1\n" \
+ " n 2,4+%1\n" \
+ " alr 0,2\n" \
+ " clr 0,%2\n" \
+ " jl 1f\n" \
+ " slr 0,%2\n" \
+ " ahi 1,1\n" \
+ "1: st 1,4+%1\n" \
+ " lr %0,0" \
+ : "=d" (__r), "=m" (__n) \
+ : "d" (base), "m" (__n) : "0", "1", "2", "cc" ); \
+ } \
+ (n) = (__n); \
+ __r; \
})
#else /* __s390x__ */
On Wed, 2004-10-27 at 09:28 -0700, Linus Torvalds wrote:
> It's very easy to test for powers of two: !((x) & ((x)-1)) does it for
> everything but zero, and zero in this case is an error anyway.
Of course 0 is not a power of two - nor a power of any other number
(although n**(-oo) where |n|>1 and n**(oo) where |n|<1 are close).
8)
--
Thayne Harbaugh
Linux Networx
On Wed, 27 Oct 2004, Zachary Amsden wrote:
>
> Passes several tests (code looks sane, assembler looks sane, boots &
> runs fine) on i386. Apologies if I unlikely broke any other
> architectures, but it's quite difficult to test them all.
I'd suggest _only_ changing the i386 version.
For example, your asm-generic changes look senseless, since they only
make the macros more complex, without actually changing anything. And the
other architectures may want to do other things, since right now at least
some of them use things like fixed hardware registers etc., which is not
necessarily appropriate for the non-asm case...
That way you'd also only modify the architecture that you can actually
verify..
Linus
diff -ru linux-2.6.10-rc1/include/asm-generic/div64.h linux-2.6.10-rc1-nsz/include/asm-generic/div64.h
--- linux-2.6.10-rc1/include/asm-generic/div64.h 2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-generic/div64.h 2004-10-27 10:24:06.000000000 -0700
@@ -22,12 +22,17 @@
#if BITS_PER_LONG == 64
-# define do_div(n,base) ({ \
- uint32_t __base = (base); \
- uint32_t __rem; \
- __rem = ((uint64_t)(n)) % __base; \
- (n) = ((uint64_t)(n)) / __base; \
- __rem; \
+# define do_div(n,base) ({ \
+ uint32_t __rem; \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __rem = ((uint64_t)(n)) % (base); \
+ (n) = ((uint64_t)(n)) / (base); \
+ } else { \
+ uint32_t __base = (base); \
+ __rem = ((uint64_t)(n)) % __base; \
+ (n) = ((uint64_t)(n)) / __base; \
+ } \
+ __rem; \
})
#elif BITS_PER_LONG == 32
@@ -37,16 +42,21 @@
/* The unnecessary pointer compare is there
* to check for type safety (n must be 64bit)
*/
-# define do_div(n,base) ({ \
- uint32_t __base = (base); \
- uint32_t __rem; \
- (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \
- if (likely(((n) >> 32) == 0)) { \
- __rem = (uint32_t)(n) % __base; \
- (n) = (uint32_t)(n) / __base; \
- } else \
- __rem = __div64_32(&(n), __base); \
- __rem; \
+# define do_div(n,base) ({ \
+ uint32_t __rem; \
+ (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __rem = ((uint64_t)(n)) % (base); \
+ (n) = ((uint64_t)(n)) / (base); \
+ } else { \
+ uint32_t __base = (base); \
+ if (likely(((n) >> 32) == 0)) { \
+ __rem = (uint32_t)(n) % __base; \
+ (n) = (uint32_t)(n) / __base; \
+ } else \
+ __rem = __div64_32(&(n), __base); \
+ } \
+ __rem; \
})
#else /* BITS_PER_LONG == ?? */
diff -ru linux-2.6.10-rc1/include/asm-i386/div64.h linux-2.6.10-rc1-nsz/include/asm-i386/div64.h
--- linux-2.6.10-rc1/include/asm-i386/div64.h 2004-10-27 10:41:40.000000000 -0700
+++ linux-2.6.10-rc1-nsz/include/asm-i386/div64.h 2004-10-27 10:15:04.000000000 -0700
@@ -14,16 +14,23 @@
* convention" on x86.
*/
#define do_div(n,base) ({ \
- unsigned long __upper, __low, __high, __mod, __base; \
- __base = (base); \
- asm("":"=a" (__low), "=d" (__high):"A" (n)); \
- __upper = __high; \
- if (__high) { \
- __upper = __high % (__base); \
- __high = __high / (__base); \
+ unsigned long __mod; \
+ if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
+ __mod = ((uint64_t)(n)) % (base); \
+ (n) = ((uint64_t)(n)) / (base); \
+ } else { \
+ unsigned long __upper, __low, __high, __base; \
+ __base = (base); \
+ asm("":"=a" (__low), "=d" (__high):"A" (n)); \
+ __upper = __high; \
+ if (__high) { \
+ __upper = __high % (__base); \
+ __high = __high / (__base); \
+ } \
+ asm("divl %2": "=a" (__low), "=d" (__mod) : \
+ "rm" (__base), "0" (__low), "1" (__upper)); \
+ asm("":"=A" (n):"a" (__low),"d" (__high)); \
} \
- asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
- asm("":"=A" (n):"a" (__low),"d" (__high)); \
__mod; \
})
On Wed, 27 Oct 2004, Zachary Amsden wrote:
>
> Backed out everything but i386 and generic. For the generic version, I
> compiled and tested this code outside of the kernel. Actually, I found
> that at least for my tool chain, the generic version
>
> +# define do_div(n,base) ({ \
> + uint32_t __rem; \
> + if (__builtin_constant_p(base) && !((base) & ((base)-1))) { \
> + __rem = ((uint64_t)(n)) % (base); \
> + (n) = ((uint64_t)(n)) / (base); \
> + } else { \
> + uint32_t __base = (base); \
> + __rem = ((uint64_t)(n)) % __base; \
> + (n) = ((uint64_t)(n)) / __base; \
> + } \
> + __rem;
>
> Does indeed generate different code for the constant case - without it,
> due to the assignment to __base, the shift / mask optimization does not
> take place.
Oh, damn. That's a separate issue, apparently, and there you just use
"__builtin_constant_p()" as a way to check that there are no side effects
on "base".
Might as well drop the check for the value, since it doesn't matter.
Alternatively, we could just document the fact that neither "base" nor "n"
are normal arguments to a function. After all, we already _do_ change "n",
and the strange calling convention is already documented as nothing but a
sick hack to make architectures able to use inline assembly efficiently.
I could add a sparse check for "no side effects", if anybody cares (so
that you could do
__builtin_warning(
!__builtin_nosideeffects(base),
"expression has side effects");
in macros like these.. Sparse already has the logic internally..
Linus
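The hazard being guarded against, in miniature: a base expression with side
effects would be evaluated twice by the constant path's separate % and /. A
hypothetical userspace demo (BAD_DIV is a made-up name with none of do_div's
guards):

#include <stdio.h>
#include <stdint.h>

#define BAD_DIV(n, base) ({ \
	uint32_t __r = (n) % (base); \
	(n) /= (base); \
	__r; })

int main(void)
{
	uint64_t n = 100;
	uint32_t bases[] = { 4, 2 };
	uint32_t *p = bases;

	/* base is evaluated twice: the remainder uses 4, the divide uses 2,
	 * and p is advanced twice.  __builtin_constant_p(*p++) is 0, so a
	 * base with side effects can never reach the patch's constant path. */
	printf("rem=%u n=%llu\n", (unsigned)BAD_DIV(n, *p++),
	       (unsigned long long)n);
	return 0;
}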
On Wed, 27 Oct 2004, Linus Torvalds wrote:
>
> I could add a sparse check for "no side effects", if anybody cares (so
> that you could do
>
> __builtin_warning(
> !__builtin_nosideeffects(base),
> "expression has side effects");
>
> in macros like these.. Sparse already has the logic internally..
Done. Except I called it __builtin_safe_p().
The kernel sources already know about "__builtin_warning()" (and
pre-process it away on gcc), so if you have a new sparse setup (as of two
minutes ago ;), you can use this thing to check that arguments to macros
do not have side effects.
Useful? You be the judge. But it was just a couple of lines in sparse, and
doing so also made it obvious how to clean up __builtin_constant_p() a lot
at the same time by just re-organizing things a bit.
My inliner and statement simplificator isn't perfect, so inline functions
sadly are not considered constant (or safe) even if they _do_ end up
returning a constant value (or be safe internally).
Linus
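A sketch of how the pre-processing away can look on the kernel side; the
__CHECKER__ guard is how sparse-only built-ins are fenced off, though the exact
definitions here are an assumption, not a quote from compiler.h:

/* Sketch: make the sparse built-ins vanish under gcc while leaving them
 * visible to sparse, which defines __CHECKER__ and implements them
 * itself.  Assumed shape, modeled on include/linux/compiler.h. */
#ifndef __CHECKER__
# define __builtin_warning(x, y...)	(1)
# define __builtin_safe_p(x)		(1)	/* assumption: same treatment */
#endif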
On Wed, 27 Oct 2004, Linus Torvalds wrote:
> > Does indeed generate different code for the constant case - without it,
> > due to the assignment to __base, the shift / mask optimization does not
> > take place.
>
> Oh, damn. That's a separate issue, apparently, and there you just use
> "__builtin_constant_p()" as a way to check that there are no side effects
> on "base".
That has to be a deficiency in that specific compiler version. For me
it just works as long as __base is const:
$ cat do_div.c
#include <stdint.h>
#define do_div(n, base) ({ \
const uint32_t __base = (base); \
uint32_t __rem; \
__rem = ((uint64_t)(n)) % __base; \
(n) = ((uint64_t)(n)) / __base; \
__rem; \
})
uint32_t div16(uint64_t *n)
{
return do_div(*n, 16);
}
$ gcc -g -O2 -fomit-frame-pointer -c do_div.c
$ objdump -d do_div.o
do_div.o: file format elf32-i386
Disassembly of section .text:
00000000 <div16>:
0: 56 push %esi
1: 53 push %ebx
2: 8b 74 24 0c mov 0xc(%esp),%esi
6: 8b 0e mov (%esi),%ecx
8: 8b 5e 04 mov 0x4(%esi),%ebx
b: 89 c8 mov %ecx,%eax
d: 0f ac d9 04 shrd $0x4,%ebx,%ecx
11: c1 eb 04 shr $0x4,%ebx
14: 89 0e mov %ecx,(%esi)
16: 89 5e 04 mov %ebx,0x4(%esi)
19: 5b pop %ebx
1a: 83 e0 0f and $0xf,%eax
1d: 5e pop %esi
1e: c3 ret
$ gcc --version
gcc (GCC) 3.5.0 20040322 (experimental)
[...]
I guess anything not older is expected to work.
Maciej
Tried this. I found a problem -- ide-cd.c calls a function to compute
base, which was caught by sparse. This is easy enough to work around,
but my adventures went further than expected..
I tested all powers of two and found gcc doesn't like to perform the
optimization for 0x80000000 as a divisor. Ok, worked around that. Now
everything appears to work great, until I started using modules:
MODPOST
*** Warning: "__udivdi3" [drivers/scsi/ipr.ko] undefined!
*** Warning: "__umoddi3" [drivers/scsi/ipr.ko] undefined!
*** Warning: "__udivdi3" [drivers/scsi/dpt_i2o.ko] undefined!
*** Warning: "__umoddi3" [drivers/scsi/dpt_i2o.ko] undefined!
*** Warning: "__udivdi3" [drivers/base/firmware_class.ko] undefined!
*** Warning: "__umoddi3" [drivers/base/firmware_class.ko] undefined!
That doesn't look good. Trying to load the modules confirms that
__uxxxdi3 is missing. How did this happen? If you look at do_div, it
is clear that __udivdi3 should not be needed, since we will always hit
the optimization path. Nevertheless, gcc inserts an extraneous ".globl
__udivdi3" in all output to the assembler from .c files which include
asm/div64.h, even if the do_div function is never directly referenced
and no instances of it appear in the assembler output (libcrc32c.c is
the easiest to verify). Apparently, this happens somewhere before the
optimization phase, and the external reference never gets dropped after
that. Since div64.h is included from sched.h, that happens to be quite
a few files.
#define do_div(n,base) ({ \
	unsigned long __mod; \
	if (unlikely(__builtin_constant_p(base) && !((base) & ((base)-1)) && \
		     (long)(base) > 0)) { \
		__mod = ((uint64_t)(n)) % ((unsigned long)(base)); \
		(n) = ((uint64_t)(n)) / ((unsigned long)(base)); \
	} else { \
The kernel proper is ok - since the optimization is done correctly and
__udivdi3 is never actually referenced, it gets dropped during the link
phase. Modules are not - the unresolved symbol stays there.
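The underlying libgcc dependency is easy to reproduce outside the kernel;
compile the following for a 32-bit target with -O2 -S and the output contains
a call to __udivdi3, the helper the kernel deliberately does not provide:

/* Sketch: a 64/64 unsigned divide with a non-constant divisor makes
 * 32-bit gcc call the libgcc helper __udivdi3 -- the symbol behind the
 * MODPOST warnings above. */
unsigned long long div64(unsigned long long n, unsigned long long base)
{
	return n / base;
}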
This leaves several options:
1) Forget the optimization altogether
2) Go back to the (base == 1) check
3) In the module post phase, strip extraneous unused external references
from modules
4) Fixup module loading to work around the problem
5) Do an enumeration case for each possible constant divisor
6) Upgrade my silly old compiler and tools
This seems like a lot of work for a trivial optimization; for i386,
perhaps #2 is the most appropriate - with a sufficiently new GCC, this
optimization should be automatic for all architectures not hardcoding
do_div as inline assembler.
Seems to have come full circle - the trivial extension turns out to have
non-trivial side effects. If only GCC were as easily extensible as
sparse! A __builtin_highest_one_bit() function would make it possible
to use inline assembler without degenerating to individual cases for
each bit.
Zachary Amsden
[email protected]
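For what it's worth, gcc does have a usable stand-in for that wish:
__builtin_ctz() counts trailing zero bits, so for a constant power-of-two
divisor it folds to exactly the shift count wanted (a sketch; my recollection
is that the built-in appeared around gcc 3.4, so treat availability as an
assumption):

/* Sketch: for a power of two, the trailing-zero count is the log2. */
#define POW2_SHIFT(y)	__builtin_ctz(y)

/* e.g. in a constant path:
 *	__mod = (n) & ((base) - 1);
 *	(n) >>= POW2_SHIFT(base);
 */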
Linus Torvalds wrote:
>On Wed, 27 Oct 2004, Linus Torvalds wrote:
>
>
>>I could add a sparse check for "no side effects", if anybody cares (so
>>that you could do
>>
>> __builtin_warning(
>> !__builtin_nosideeffects(base),
>> "expression has side effects");
>>
>>in macros like these.. Sparse already has the logic internally..
>>
>>
>
>Done. Except I called it __builtin_safe_p().
>
>The kernel sources already know about "__builtin_warning()" (and
>pre-process it away on gcc), so if you have a new sparse setup (as of two
>minutes ago ;), you can use this thing to check that arguments to macros
>do not have side effects.
>
>Useful? You be the judge. But it was just a couple of lines in sparse, and
>doing so also made it obvious how to clean up __builtin_constant_p() a lot
>at the same time by just re-organizing things a bit.
>
>My inliner and statement simplificator isn't perfect, so inline functions
>sadly are not considered constant (or safe) even if they _do_ end up
>returning a constant value (or be safe internally).
>
> Linus
>
>
On Thu, 28 Oct 2004, Zachary Amsden wrote:
>
> This leaves several options:
>
> 1) Forget the optimization altogether
> 2) Go back to the (base == 1) check
Ok, I think I led you on a merry goose-chase, and the "base == 1" check
was the only one worth bothering with after all. Sorry about that.
> This seems like a lot of work for a trivial optimization; for i386,
> perhaps #2 is the most appropriate - with a sufficiently new GCC, this
> optimization should be automatic for all architectures not hardcoding
> do_div as inline assembler.
The do_div() optimization is a trivial one, and one that gcc should
definitely have recognized. It's not like a compiler cannot see that you
have a 64 / 32 divide, and realize that it's cheaper than a full 64 / 64
divide.
But hey, even if the gcc people finally did that optimization today, it
would take a few years before we didn't support old compilers any more,
so.
> Seems to have come full circle - the trivial extension turns out to have
> non-trivial side effects. If only GCC were as easily extensible as
> sparse! A __builtin_highest_one_bit() function would make it possible
> to use inline assembler without degenerating to individual cases for
> each bit.
Yes. There are tons of places where we'd love to have a constant
compile-time "log2()" function. And yes, I could do it in sparse in about
ten more lines of code, but..
I guess you could do it with a lot of tests... Something like this should
do it
#define __constant_log2(y) ((y==1) ? 0 : \
(y==2) ? 1 : \
(y==4) ? 2 : \
(y==8) ? 3 : -1) /* We could go on ... */
#define do_div(x,y) ({ \
unsigned long __mod; \
int __log2; \
if (__builtin_constant_p(y) && \
!((y) & ((y)-1)) && \
(__log2 = __constant_log2((y))) >= 0) { \
 __mod = x & ((y)-1); \
(x) >>= __log2; \
} else { \
.. inline asm case .. \
} \
__mod; })
which looks like it should work, but it's getting so ugly that I suspect I
should be committed for even thinking about it.
(And no, I didn't test the above. It is all trivially optimizable by a
compiler, and I can't see how gcc could _fail_ to get it right, but hey, I
thought the previous thing would work too, so I'm clearly not competent to
make that judgement... ;)
Linus
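For the record, the idea does compile; a userspace rendering (div_sketch and
__constant_log2 are my names, and the else branch uses plain C division where
the kernel macro would use inline asm):

#include <stdint.h>

#define __constant_log2(y) ((y) == 1 ? 0 : \
			    (y) == 2 ? 1 : \
			    (y) == 4 ? 2 : \
			    (y) == 8 ? 3 : -1) /* extend as needed */

#define div_sketch(x, y) ({ \
	unsigned long __mod; \
	int __log2; \
	if (__builtin_constant_p(y) && !((y) & ((y) - 1)) && \
	    (__log2 = __constant_log2(y)) >= 0) { \
		__mod = (x) & ((y) - 1); \
		(x) >>= __log2; \
	} else { \
		__mod = (x) % (y); \
		(x) /= (y); \
	} \
	__mod; })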
On Thu, 28 Oct 2004, Linus Torvalds wrote:
> On Thu, 28 Oct 2004, Zachary Amsden wrote:
> #define __constant_log2(y) ((y==1) ? 0 : \
> (y==2) ? 1 : \
> (y==4) ? 2 : \
> (y==8) ? 3 : -1) /* We could go on ... */
>
> #define do_div(x,y) ({ \
> unsigned long __mod; \
> int __log2; \
> if (__builtin_constant_p(y) && \
> !((y) & ((y)-1)) && \
> (__log2 = __constant_log2((y))) >= 0) { \
> __mod = x & ((y)-1); \
> (x) >>= __log2; \
> } else { \
> .. inline asm case .. \
> } \
> __mod; })
>
> which looks like it should work, but it's getting so ugly that I suspect I
> should be committed for even thinking about it.
>
> (And no, I didn't test the above. It is all trivially optimizable by a
> compiler, and I can't see how gcc could _fail_ to get it right, but hey, I
> thought the previous thing would work too, so I'm clearly not competent to
> make that judgement... ;)
Perhaps this is why you see in many header files code like this:
static inline xxx_const_case(...) { ... }
static inline xxx_nonconst_case(...) { ... }
#define xxx(...) __builtin_constant_p(...) ? xxx_const_case(...) \
: xxx_nonconst_case(...)
i.e. gcc is better at optimizing away calls to non-called functions?
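Concretely, the pattern might look like this (a sketch with made-up names, not
taken from any real header; both bodies are plain C here, where a real port
would use inline asm in the non-constant case):

#include <stdint.h>

/* Constant case: plain C, so gcc can fold a power-of-two divisor. */
static inline uint32_t __div64_const(uint64_t *n, uint32_t base)
{
	uint32_t rem = *n % base;
	*n /= base;
	return rem;
}

/* Non-constant case: stand-in for an inline-asm implementation. */
static inline uint32_t __div64_var(uint64_t *n, uint32_t base)
{
	uint32_t rem = *n % base;
	*n /= base;
	return rem;
}

#define div64(n, base) \
	(__builtin_constant_p(base) ? __div64_const(&(n), (base)) \
				    : __div64_var(&(n), (base)))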
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds