2016-04-27 08:07:39

by Zhaoxiu Zeng

Subject: [patch V2] lib: GCD: add binary GCD algorithm

From: Zhaoxiu Zeng <[email protected]>

Because some architectures (alpha, armv6, etc.) don't provide hardware division,
the mod operation is slow. The binary GCD algorithm uses only simple arithmetic
operations, replacing division with arithmetic shifts, comparisons, and subtraction.
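
As a quick illustration (my own example, not part of the patch): for
gcd(48, 18), r = 48 | 18 has its lowest set bit at 2^1, so one factor of 2
belongs to the result. Stripping trailing zeros gives 9 and 3; then
9 - 3 = 6, which strips to 3, and 3 == 3, so the odd part is 3 and
gcd(48, 18) = 3 * 2 = 6, without a single division.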

I have compiled it successfully with x86_64_defconfig and i386_defconfig.

I used the following code to test:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <unistd.h>
#include <time.h>

#define swap(a, b) \
        do { \
                a ^= b; \
                b ^= a; \
                a ^= b; \
        } while (0)

/* Division-based (Euclidean) reference implementation */
unsigned long gcd0(unsigned long a, unsigned long b)
{
        unsigned long r;

        if (a < b) {
                swap(a, b);
        }

        if (b == 0)
                return a;

        while ((r = a % b) != 0) {
                a = b;
                b = r;
        }

        return b;
}

/*
 * Binary GCD; __builtin_ctzl stands in for the kernel's __ffs()
 * (both return the 0-based index of the lowest set bit, undefined for 0).
 */
unsigned long gcd1(unsigned long a, unsigned long b)
{
        unsigned long r = a | b;

        if (!a || !b)
                return r;

        b >>= __builtin_ctzl(b);
        for (;;) {

                a >>= __builtin_ctzl(a);
                if (a == b)
                        break;

                if (a < b)
                        swap(a, b);
                a -= b;
        }

        b <<= __builtin_ctzl(r);
        return b;
}

/* Even/odd binary GCD variant that avoids ctz/ffs inside the loop */
unsigned long gcd2(unsigned long a, unsigned long b)
{
        unsigned long r = a | b;

        if (!a || !b)
                return r;

        r ^= (r - 1);

        while (!(b & r))
                b >>= 1;

        for (;;) {
                while (!(a & r))
                        a >>= 1;
                if (a == b)
                        break;

                if (a < b)
                        swap(a, b);

                a -= b;
                a >>= 1;
                if (a & r)
                        a += b;
        }

        return b;
}

static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
        gcd0, gcd1, gcd2,
};

#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))
#define TEST_TIMES 100
static unsigned long res[TEST_ENTRIES][TEST_TIMES];

#define TIME_T struct timespec

static inline struct timespec read_time(void)
{
        struct timespec time;
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
        return time;
}

static uint64_t diff_time(struct timespec start, struct timespec end)
{
        struct timespec temp;

        if ((end.tv_nsec - start.tv_nsec) < 0) {
                temp.tv_sec = end.tv_sec - start.tv_sec - 1;
                temp.tv_nsec = 1000000000UL + end.tv_nsec - start.tv_nsec;
        } else {
                temp.tv_sec = end.tv_sec - start.tv_sec;
                temp.tv_nsec = end.tv_nsec - start.tv_nsec;
        }

        return temp.tv_sec * 1000000000UL + temp.tv_nsec;
}

static inline unsigned long get_rand(void)
{
        if (sizeof(long) == 8)
                return (unsigned long)rand() << 32 | rand();
        else
                return rand();
}

int main(int argc, char **argv)
{
        unsigned int seed = time(0);
        unsigned int i, j;
        TIME_T start, end;

        for (i = 0; i < TEST_ENTRIES; i++) {
                srand(seed);
                start = read_time();
                for (j = 0; j < TEST_TIMES; j++)
                        res[i][j] = gcd_func[i](get_rand(), get_rand());
                end = read_time();
                printf("gcd%u: elapsed %" PRIu64 "\n", i, diff_time(start, end));
                sleep(1);
        }

        for (j = 0; j < TEST_TIMES; j++) {
                for (i = 1; i < TEST_ENTRIES; i++) {
                        if (res[i][j] != res[0][j])
                                break;
                }
                if (i < TEST_ENTRIES) {
                        fprintf(stderr, "Error: ");
                        for (i = 0; i < TEST_ENTRIES; i++)
                                fprintf(stderr, "res%u %lu%s", i, res[i][j], i < TEST_ENTRIES - 1 ? ", " : "\n");
                }
        }

        return 0;
}
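
(Build assumption on my part, not stated in the mail: a plain
"gcc -O2 gcd.c -o gcd" should suffice; older glibc additionally needs
"-lrt" for clock_gettime().)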

Compiled with "-O2" and run on "VirtualBox 4.2.0-35-generic #40-Ubuntu x86_64", I got:

zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 92281
gcd1: elapsed 55005
gcd2: elapsed 91088
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 115546
gcd1: elapsed 55928
gcd2: elapsed 91938
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 91189
gcd1: elapsed 55493
gcd2: elapsed 90078
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 157364
gcd1: elapsed 55204
gcd2: elapsed 90058
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
gcd0: elapsed 91667
gcd1: elapsed 54641
gcd2: elapsed 91364

Changes since V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <[email protected]>
---
lib/gcd.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 72 insertions(+), 9 deletions(-)

diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..460b414 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,19 +2,82 @@
#include <linux/gcd.h>
#include <linux/export.h>

-/* Greatest common divisor */
+/*
+ * use __ffs if the CPU has efficient __ffs
+ */
+#if (defined(CONFIG_ALPHA) && defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)) || \
+ defined(CONFIG_ARC) || \
+ (defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 5) || defined(CONFIG_ARM64) || \
+ defined(CONFIG_AVR32) || \
+ defined(CONFIG_BLACKFIN) || \
+ defined(CONFIG_C6X) || \
+ defined(CONFIG_CRIS) || \
+ defined(CONFIG_FRV) || \
+ defined(CONFIG_HEXAGON) || \
+ defined(CONFIG_IA64) || \
+ (defined(CONFIG_M68K) && \
+ (!defined(CONFIG_CPU_HAS_NO_BITFIELDS) || \
+ ((defined(__mcfisaaplus__) || defined(__mcfisac__)) && \
+ !defined(CONFIG_M68000) && !defined(CONFIG_MCPU32)))) || \
+ defined(CONFIG_MN10300) || \
+ defined(CONFIG_OPENRISC) || \
+ defined(CONFIG_POWERPC) || \
+ defined(CONFIG_S390) || \
+ defined(CONFIG_TILE) || \
+ defined(CONFIG_UNICORE32) || \
+ defined(CONFIG_X86) || \
+ defined(CONFIG_XTENSA)
+# define USE_FFS 1
+#elif defined(CONFIG_MIPS)
+# define USE_FFS (__builtin_constant_p(cpu_has_clo_clz) && cpu_has_clo_clz)
+#else
+# define USE_FFS 0
+#endif
+
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knith has noted, appears a first-century Chinese math text.)
+ */
unsigned long gcd(unsigned long a, unsigned long b)
{
-        unsigned long r;
+        unsigned long r = a | b;
+
+        if (!a || !b)
+                return r;
+
+        if (USE_FFS) {
+                b >>= __ffs(b);
+        } else {
+                /* least-significant-bit mask, equivalent to "(1UL << ffs(r)) - 1" */
+                r ^= (r - 1);
+
+                while (!(b & r))
+                        b >>= 1;
+        }
+
+        for (;;) {
+                if (USE_FFS) {
+                        a >>= __ffs(a);
+                } else {
+                        while (!(a & r))
+                                a >>= 1;
+                }
+                if (a == b)
+                        break;

-        if (a < b)
-                swap(a, b);
+                if (a < b)
+                        swap(a, b);
+
+                a -= b;
+                if (!USE_FFS) {
+                        a >>= 1;
+                        if (a & r)
+                                a += b;
+                }
+        }

-        if (!b)
-                return a;
-        while ((r = a % b) != 0) {
-                a = b;
-                b = r;
+        if (USE_FFS) {
+                b <<= __ffs(r);
         }
         return b;
 }
--
2.5.0



2016-04-27 16:20:38

by George Spelvin

Subject: Re: [patch V2] lib: GCD: add binary GCD algorithm

I replicated your results on 32- and 64-bit x86:

- If __ffs (gcc's __builtin_ctz) is available, the basic
  (non-even/odd) binary GCD algorithm is faster than the
  division-based or even/odd versions.
- If shifting down has to be done in a loop, the even/odd
  binary algorithm is fastest.

I tried a few variants, but couldn't consistently beat your code.
The following variant, with the loop test at the end, seemed to do better
in some cases, but it's not consistent enough to be sure:

static unsigned long gcd5(unsigned long a, unsigned long b)
{
        unsigned long r = a | b;

        if (!a || !b)
                return r;

        b >>= __builtin_ctzl(b);

        do {
                a >>= __builtin_ctzl(a);
                if (a < b)
                        swap(a, b);
                a -= b;
        } while (a);    /* a hits 0 only when a == b, i.e. b is the odd part of the gcd */
        return b << __builtin_ctzl(r);
}


The frequent "if (USE_FFS)" conditions seem to clutter the code.
Would it be easier to read if the two variants were just separated?
The function is short enough that the duplication isn't too severe.

Such as the following.

A few source cleanups (included as examples; delete them if you don't
like them):
- Return directly from the loop, rather than using break.
- Use "r &= -r", mostly because it's clearer (see the notes after the code below).

Signed-off-by: George Spelvin <[email protected]>

/*
 * This implements the binary GCD algorithm. (Often attributed to Stein,
 * but as Knuth has noted, it appears in a first-century Chinese math text.)
 *
 * This is faster than the division-based algorithm even on x86, which
 * has decent hardware division.
 */
#if USE_FFS
/* If __ffs is available, the even/odd algorithm benchmarks slower. */
unsigned long gcd(unsigned long a, unsigned long b)
{
        unsigned long r = a | b;

        if (!a || !b)
                return r;

        b >>= __ffs(b);

        for (;;) {
                a >>= __ffs(a);
                if (a == b)
                        return a << __ffs(r);
                if (a < b)
                        swap(a, b);
                a -= b;
        }
}

#else /* !USE_FFS */

/* If normalization is done by loops, the even/odd algorithm is a win. */
unsigned long gcd(unsigned long a, unsigned long b)
{
        unsigned long r = a | b;

        if (!a || !b)
                return r;

        r &= -r; /* Isolate lsbit of r */

        while (!(b & r))
                b >>= 1;

        for (;;) {
                while (!(a & r))
                        a >>= 1;
                if (a == b)
                        return a;
                if (a < b)
                        swap(a, b);
                a -= b;
                a >>= 1;
                if (a & r)
                        a += b;
                a >>= 1;
        }
}
#endif
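
Two side notes for readers (my gloss, not part of George's message):

- "r &= -r" keeps only the lowest set bit of r (e.g. 0b101100 becomes
  0b000100), whereas the V2 patch's "r ^= (r - 1)" produces that bit plus
  all the bits below it (0b000111). Since a and b remain multiples of that
  lowest bit throughout, either expression works as the "is the interesting
  bit set?" test; the isolated lsbit is simply easier to read.

- The even/odd step relies on: for odd A and B, gcd(A, B) = gcd(A - B, B)
  = gcd((A - B)/2, B), since A - B is even and B is odd. If (A - B)/2 is
  still odd, adding B leaves the gcd unchanged and yields (A + B)/2, which
  is then even, so the unconditional second "a >>= 1" is always safe.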

2016-04-27 20:09:01

by Andrew Morton

Subject: Re: [patch V2] lib: GCD: add binary GCD algorithm

On Wed, 27 Apr 2016 16:05:50 +0800 [email protected] wrote:

> From: Zhaoxiu Zeng <[email protected]>
>
> Because some architectures (alpha, armv6, etc.) don't provide hardware division,
> the mod operation is slow. The binary GCD algorithm uses only simple arithmetic
> operations, replacing division with arithmetic shifts, comparisons, and subtraction.
>
> I have compiled it successfully with x86_64_defconfig and i386_defconfig.
>
> I used the following code to test:
>
> ...
>
> Compiled with "-O2" and run on "VirtualBox 4.2.0-35-generic #40-Ubuntu x86_64", I got:
>
> zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
> gcd0: elapsed 92281
> gcd1: elapsed 55005
> gcd2: elapsed 91088
> zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
> gcd0: elapsed 115546
> gcd1: elapsed 55928
> gcd2: elapsed 91938
> zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
> gcd0: elapsed 91189
> gcd1: elapsed 55493
> gcd2: elapsed 90078
> zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
> gcd0: elapsed 157364
> gcd1: elapsed 55204
> gcd2: elapsed 90058
> zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd
> gcd0: elapsed 91667
> gcd1: elapsed 54641
> gcd2: elapsed 91364

Can you please include a summary which describes the overall impact of
the patch? e.g., how much faster does a%b become on <some architecture>?

> --- a/lib/gcd.c
> +++ b/lib/gcd.c
> @@ -2,19 +2,82 @@
> #include <linux/gcd.h>
> #include <linux/export.h>
>
> -/* Greatest common divisor */
> +/*
> + * use __ffs if the CPU has efficient __ffs
> + */
> +#if (defined(CONFIG_ALPHA) && defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)) || \
> + defined(CONFIG_ARC) || \
> + (defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 5) || defined(CONFIG_ARM64) || \
> + defined(CONFIG_AVR32) || \
> + defined(CONFIG_BLACKFIN) || \
> + defined(CONFIG_C6X) || \
> + defined(CONFIG_CRIS) || \
> + defined(CONFIG_FRV) || \
> + defined(CONFIG_HEXAGON) || \
> + defined(CONFIG_IA64) || \
> + (defined(CONFIG_M68K) && \
> + (!defined(CONFIG_CPU_HAS_NO_BITFIELDS) || \
> + ((defined(__mcfisaaplus__) || defined(__mcfisac__)) && \
> + !defined(CONFIG_M68000) && !defined(CONFIG_MCPU32)))) || \
> + defined(CONFIG_MN10300) || \
> + defined(CONFIG_OPENRISC) || \
> + defined(CONFIG_POWERPC) || \
> + defined(CONFIG_S390) || \
> + defined(CONFIG_TILE) || \
> + defined(CONFIG_UNICORE32) || \
> + defined(CONFIG_X86) || \
> + defined(CONFIG_XTENSA)
> +# define USE_FFS 1
> +#elif defined(CONFIG_MIPS)
> +# define USE_FFS (__builtin_constant_p(cpu_has_clo_clz) && cpu_has_clo_clz)
> +#else
> +# define USE_FFS 0
> +#endif

Yikes.

Normally we'd create a new Kconfig variable and do this in the
individual arch/XXX/Kconfig files.

Can we just assume CONFIG_ARCH_HAS_SLOW_FFS is false and then set it to
true for MIPS, arm, etc?
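
(Illustration only, not from the thread: with such a symbol, selected in the
Kconfig of the affected architectures, the whole #if forest in lib/gcd.c
would collapse to something like the following; the symbol name is the one
suggested above.)

/* Arches with a slow, loop-based __ffs would select this in their Kconfig. */
#ifdef CONFIG_ARCH_HAS_SLOW_FFS
# define USE_FFS 0
#else
# define USE_FFS 1
#endif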

> +/*
> + * This implements the binary GCD algorithm. (Often attributed to Stein,
> + * but as Knith has noted, appears a first-century Chinese math text.)
> + */

Knith might be Knuth?

> unsigned long gcd(unsigned long a, unsigned long b)
> {
> -        unsigned long r;
> +        unsigned long r = a | b;
> +
> +        if (!a || !b)
> +                return r;

2016-04-28 07:21:26

by Peter Zijlstra

Subject: Re: [patch V2] lib: GCD: add binary GCD algorithm

On Wed, Apr 27, 2016 at 12:20:33PM -0400, George Spelvin wrote:
> The frequent "if (USE_FFS)" conditions seem to clutter the code.
> Would it be easier to read if the two variants were just separated?
> The function is short enough that the duplication isn't too severe.

Yes please, this is _MUCH_ more readable.

> #if USE_FFS
> /* If __ffs is available, the even/odd algorithm benchmarks slower. */
> unsigned long gcd(unsigned long a, unsigned long b)
> {
>         unsigned long r = a | b;
>
>         if (!a || !b)
>                 return r;
>
>         b >>= __ffs(b);
>
>         for (;;) {
>                 a >>= __ffs(a);
>                 if (a == b)
>                         return a << __ffs(r);
>                 if (a < b)
>                         swap(a, b);
>                 a -= b;
>         }
> }
>
> #else /* !USE_FFS */
>
> /* If normalization is done by loops, the even/odd algorithm is a win. */
> unsigned long gcd(unsigned long a, unsigned long b)
> {
>         unsigned long r = a | b;
>
>         if (!a || !b)
>                 return r;
>
>         r &= -r; /* Isolate lsbit of r */
>
>         while (!(b & r))
>                 b >>= 1;
>
>         for (;;) {
>                 while (!(a & r))
>                         a >>= 1;
>                 if (a == b)
>                         return a;
>                 if (a < b)
>                         swap(a, b);
>                 a -= b;
>                 a >>= 1;
>                 if (a & r)
>                         a += b;
>                 a >>= 1;
>         }
> }
> #endif