Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753492AbcD0IHj (ORCPT ); Wed, 27 Apr 2016 04:07:39 -0400 Received: from m50-135.163.com ([123.125.50.135]:60299 "EHLO m50-135.163.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753346AbcD0IH3 (ORCPT ); Wed, 27 Apr 2016 04:07:29 -0400 From: zengzhaoxiu@163.com To: akpm@linux-foundation.org, linux@horizon.com, peterz@infradead.org Cc: linux-kernel@vger.kernel.org, Zhaoxiu Zeng Subject: [patch V2] lib: GCD: add binary GCD algorithm Date: Wed, 27 Apr 2016 16:05:50 +0800 Message-Id: <1461744350-24156-1-git-send-email-zengzhaoxiu@163.com> X-Mailer: git-send-email 2.5.0 X-CM-TRANSID: D9GowADn7HTgciBXpTqCDQ--.3894S2 X-Coremail-Antispam: 1Uf129KBjvJXoW3WrW7WF18CrWDuw4fAr4kCrg_yoW7tFW7pF Z2k34rtrW0qa47KrWfZr1qvrZIyrZ5GFyUXrW7J34DZF18Aa1rXr1qyrs8XrsFq3yxXa1r A34ftFn8CrZ8ZaDanT9S1TB71UUUUUUqnTZGkaVYY2UrUUUUjbIjqfuFe4nvWSU5nxnvy2 9KBjDUYxBIdaVFxhVjvjDU0xZFpf9x07jFD73UUUUU= X-Originating-IP: [112.95.225.98] X-CM-SenderInfo: p2hqw6xkdr5xrx6rljoofrz/xtbBygZYgFO+jF-37QAAsw Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 6214 Lines: 297 From: Zhaoxiu Zeng Because some architectures (alpha, armv6, etc.) don't provide hardware division, the mod operation is slow! Binary GCD algorithm uses simple arithmetic operations, it replaces division with arithmetic shifts, comparisons, and subtraction. I have compiled successfully with x86_64_defconfig and i386_defconfig. I use the following code to test: #include #include #include #include #define swap(a, b) \ do { \ a ^= b; \ b ^= a; \ a ^= b; \ } while (0) unsigned long gcd0(unsigned long a, unsigned long b) { unsigned long r; if (a < b) { swap(a, b); } if (b == 0) return a; while ((r = a % b) != 0) { a = b; b = r; } return b; } unsigned long gcd1(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; b >>= __builtin_ctzl(b); for (;;) { a >>= __builtin_ctzl(a); if (a == b) break; if (a < b) swap(a, b); a -= b; } b <<= __builtin_ctzl(r); return b; } unsigned long gcd2(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; r ^= (r - 1); while (!(b & r)) b >>= 1; for (;;) { while (!(a & r)) a >>= 1; if (a == b) break; if (a < b) swap(a, b); a -= b; a >>= 1; if (a & r) a += b; } return b; } static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = { gcd0, gcd1, gcd2, }; #define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0])) #define TEST_TIMES 100 static unsigned long res[TEST_ENTRIES][TEST_TIMES]; #define TIME_T struct timespec static inline struct timespec read_time(void) { struct timespec time; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time); return time; } static uint64_t diff_time(struct timespec start, struct timespec end) { struct timespec temp; if ((end.tv_nsec - start.tv_nsec) < 0) { temp.tv_sec = end.tv_sec - start.tv_sec - 1; temp.tv_nsec = 1000000000UL + end.tv_nsec - start.tv_nsec; } else { temp.tv_sec = end.tv_sec - start.tv_sec; temp.tv_nsec = end.tv_nsec - start.tv_nsec; } return temp.tv_sec * 1000000000UL + temp.tv_nsec; } static inline unsigned long get_rand() { if (sizeof(long) == 8) return (unsigned long)rand() << 32 | rand(); else return rand(); } int main(int argc, char **argv) { unsigned int seed = time(0); unsigned int i, j; TIME_T start, end; for (i = 0; i < TEST_ENTRIES; i++) { srand(seed); start = read_time(); for (j = 0; j < TEST_TIMES; j++) res[i][j] = gcd_func[i](get_rand(), get_rand()); end = read_time(); printf("gcd%d: elapsed %lld\n", i, diff_time(start, end)); sleep(1); } for (j = 0; j < TEST_TIMES; j++) { for (i = 1; i < TEST_ENTRIES; i++) { if (res[i][j] != res[0][j]) break; } if (i < TEST_ENTRIES) { fprintf(stderr, "Error: "); for (i = 0; i < TEST_ENTRIES; i++) fprintf(stderr, "res%d %ld%s", i, res[i][j], i < TEST_ENTRIES - 1 ? ", " : "\n"); } } return 0; } Compiled with "-O2", on "VirtualBox 4.2.0-35-generic #40-Ubuntu x86_64" got: zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 92281 gcd1: elapsed 55005 gcd2: elapsed 91088 zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 115546 gcd1: elapsed 55928 gcd2: elapsed 91938 zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 91189 gcd1: elapsed 55493 gcd2: elapsed 90078 zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 157364 gcd1: elapsed 55204 gcd2: elapsed 90058 zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 91667 gcd1: elapsed 54641 gcd2: elapsed 91364 Changes to V1: - Don't touch Kconfig, remove the Euclidean algorithm implementation - Don't use the "even-odd" variant - Use __ffs if the CPU has efficient __ffs Signed-off-by: Zhaoxiu Zeng --- lib/gcd.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 9 deletions(-) diff --git a/lib/gcd.c b/lib/gcd.c index 3657f12..460b414 100644 --- a/lib/gcd.c +++ b/lib/gcd.c @@ -2,19 +2,82 @@ #include #include -/* Greatest common divisor */ +/* + * use __ffs if the CPU has efficient __ffs + */ +#if (defined(CONFIG_ALPHA) && defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)) || \ + defined(CONFIG_ARC) || \ + (defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 5) || defined(CONFIG_ARM64) || \ + defined(CONFIG_AVR32) || \ + defined(CONFIG_BLACKFIN) || \ + defined(CONFIG_C6X) || \ + defined(CONFIG_CRIS) || \ + defined(CONFIG_FRV) || \ + defined(CONFIG_HEXAGON) || \ + defined(CONFIG_IA64) || \ + (defined(CONFIG_M68K) && \ + (!defined(CONFIG_CPU_HAS_NO_BITFIELDS) || \ + ((defined(__mcfisaaplus__) || defined(__mcfisac__)) && \ + !defined(CONFIG_M68000) && !defined(CONFIG_MCPU32)))) || \ + defined(CONFIG_MN10300) || \ + defined(CONFIG_OPENRISC) || \ + defined(CONFIG_POWERPC) || \ + defined(CONFIG_S390) || \ + defined(CONFIG_TILE) || \ + defined(CONFIG_UNICORE32) || \ + defined(CONFIG_X86) || \ + defined(CONFIG_XTENSA) +# define USE_FFS 1 +#elif defined(CONFIG_MIPS) +# define USE_FFS (__builtin_constant_p(cpu_has_clo_clz) && cpu_has_clo_clz) +#else +# define USE_FFS 0 +#endif + +/* + * This implements the binary GCD algorithm. (Often attributed to Stein, + * but as Knith has noted, appears a first-century Chinese math text.) + */ unsigned long gcd(unsigned long a, unsigned long b) { - unsigned long r; + unsigned long r = a | b; + + if (!a || !b) + return r; + + if (USE_FFS) { + b >>= __ffs(b); + } else { + /* least-significant mask, equ to "(1UL << ffs(r)) - 1" */ + r ^= (r - 1); + + while (!(b & r)) + b >>= 1; + } + + for (;;) { + if (USE_FFS) { + a >>= __ffs(a); + } else { + while (!(a & r)) + a >>= 1; + } + if (a == b) + break; - if (a < b) - swap(a, b); + if (a < b) + swap(a, b); + + a -= b; + if (!USE_FFS) { + a >>= 1; + if (a & r) + a += b; + } + } - if (!b) - return a; - while ((r = a % b) != 0) { - a = b; - b = r; + if (USE_FFS) { + b <<= __ffs(r); } return b; } -- 2.5.0