2015-07-16 12:40:23

by Denys Vlasenko

[permalink] [raw]
Subject: [PATCH v2] jhash: Deinline jhash, jhash2 and __jhash_nwords

This patch deinlines jhash, jhash2 and __jhash_nwords.

It also removes rhashtable_jhash2(key, length, seed)
because it was merely calling jhash2(key, length, seed).

With this .config: http://busybox.net/~vda/kernel_config,
after deinlining these functions have sizes and callsite counts
as follows:

__jhash_nwords: 72 bytes, 75 calls
jhash: 297 bytes, 111 calls
jhash2: 205 bytes, 136 calls

Total size decrease is about 38,000 bytes:

text data bss dec hex filename
90663567 17221960 36659200 144544727 89d93d7 vmlinux5
90625577 17221864 36659200 144506641 89cff11 vmlinux.after

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Thomas Graf <[email protected]>
CC: Alexander Duyck <[email protected]>
CC: Jozsef Kadlecsik <[email protected]>
CC: Herbert Xu <[email protected]>
CC: [email protected]
CC: [email protected]
---
Changes in v2: created a new source file, jhash.c

include/linux/jhash.h | 123 +----------------------------------------
lib/Makefile | 2 +-
lib/jhash.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++
lib/rhashtable.c | 13 +++--
4 files changed, 160 insertions(+), 127 deletions(-)
create mode 100644 lib/jhash.c

diff --git a/include/linux/jhash.h b/include/linux/jhash.h
index 348c6f4..0b3f55d 100644
--- a/include/linux/jhash.h
+++ b/include/linux/jhash.h
@@ -31,131 +31,14 @@
/* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */
#define jhash_mask(n) (jhash_size(n)-1)

-/* __jhash_mix -- mix 3 32-bit values reversibly. */
-#define __jhash_mix(a, b, c) \
-{ \
- a -= c; a ^= rol32(c, 4); c += b; \
- b -= a; b ^= rol32(a, 6); a += c; \
- c -= b; c ^= rol32(b, 8); b += a; \
- a -= c; a ^= rol32(c, 16); c += b; \
- b -= a; b ^= rol32(a, 19); a += c; \
- c -= b; c ^= rol32(b, 4); b += a; \
-}
-
-/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
-#define __jhash_final(a, b, c) \
-{ \
- c ^= b; c -= rol32(b, 14); \
- a ^= c; a -= rol32(c, 11); \
- b ^= a; b -= rol32(a, 25); \
- c ^= b; c -= rol32(b, 16); \
- a ^= c; a -= rol32(c, 4); \
- b ^= a; b -= rol32(a, 14); \
- c ^= b; c -= rol32(b, 24); \
-}
-
/* An arbitrary initial parameter */
#define JHASH_INITVAL 0xdeadbeef

-/* jhash - hash an arbitrary key
- * @k: sequence of bytes as key
- * @length: the length of the key
- * @initval: the previous hash, or an arbitray value
- *
- * The generic version, hashes an arbitrary sequence of bytes.
- * No alignment or length assumptions are made about the input key.
- *
- * Returns the hash value of the key. The result depends on endianness.
- */
-static inline u32 jhash(const void *key, u32 length, u32 initval)
-{
- u32 a, b, c;
- const u8 *k = key;
-
- /* Set up the internal state */
- a = b = c = JHASH_INITVAL + length + initval;
-
- /* All but the last block: affect some 32 bits of (a,b,c) */
- while (length > 12) {
- a += __get_unaligned_cpu32(k);
- b += __get_unaligned_cpu32(k + 4);
- c += __get_unaligned_cpu32(k + 8);
- __jhash_mix(a, b, c);
- length -= 12;
- k += 12;
- }
- /* Last block: affect all 32 bits of (c) */
- /* All the case statements fall through */
- switch (length) {
- case 12: c += (u32)k[11]<<24;
- case 11: c += (u32)k[10]<<16;
- case 10: c += (u32)k[9]<<8;
- case 9: c += k[8];
- case 8: b += (u32)k[7]<<24;
- case 7: b += (u32)k[6]<<16;
- case 6: b += (u32)k[5]<<8;
- case 5: b += k[4];
- case 4: a += (u32)k[3]<<24;
- case 3: a += (u32)k[2]<<16;
- case 2: a += (u32)k[1]<<8;
- case 1: a += k[0];
- __jhash_final(a, b, c);
- case 0: /* Nothing left to add */
- break;
- }
-
- return c;
-}
-
-/* jhash2 - hash an array of u32's
- * @k: the key which must be an array of u32's
- * @length: the number of u32's in the key
- * @initval: the previous hash, or an arbitray value
- *
- * Returns the hash value of the key.
- */
-static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
-{
- u32 a, b, c;
-
- /* Set up the internal state */
- a = b = c = JHASH_INITVAL + (length<<2) + initval;
-
- /* Handle most of the key */
- while (length > 3) {
- a += k[0];
- b += k[1];
- c += k[2];
- __jhash_mix(a, b, c);
- length -= 3;
- k += 3;
- }
-
- /* Handle the last 3 u32's: all the case statements fall through */
- switch (length) {
- case 3: c += k[2];
- case 2: b += k[1];
- case 1: a += k[0];
- __jhash_final(a, b, c);
- case 0: /* Nothing left to add */
- break;
- }
-
- return c;
-}
-
+u32 jhash(const void *key, u32 length, u32 initval);
+u32 jhash2(const u32 *k, u32 length, u32 initval);

/* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
-static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
-{
- a += initval;
- b += initval;
- c += initval;
-
- __jhash_final(a, b, c);
-
- return c;
-}
+u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval);

static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
{
diff --git a/lib/Makefile b/lib/Makefile
index 6897b52..978be53 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
bsearch.o find_bit.o llist.o memweight.o kfifo.o \
- percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
+ percpu-refcount.o percpu_ida.o jhash.o rhashtable.o reciprocal_div.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += hexdump.o
diff --git a/lib/jhash.c b/lib/jhash.c
new file mode 100644
index 0000000..cf3c277
--- /dev/null
+++ b/lib/jhash.c
@@ -0,0 +1,149 @@
+/* Jenkins hash support.
+ *
+ * Copyright (C) 2006. Bob Jenkins ([email protected])
+ *
+ * http://burtleburtle.net/bob/hash/
+ *
+ * These are the credits from Bob's sources:
+ *
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * These are functions for producing 32-bit hashes for hash table lookup.
+ * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+ * are externally useful functions. Routines to test the hash are included
+ * if SELF_TEST is defined. You can use this free for any purpose. It's in
+ * the public domain. It has no warranty.
+ *
+ * Copyright (C) 2009-2010 Jozsef Kadlecsik ([email protected])
+ *
+ * I've modified Bob's hash to be useful in the Linux kernel, and
+ * any bugs present are my fault.
+ * Jozsef
+ */
+#include <linux/export.h>
+#include <linux/jhash.h>
+
+/* __jhash_mix -- mix 3 32-bit values reversibly. */
+#define __jhash_mix(a, b, c) \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+
+/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
+#define __jhash_final(a, b, c) \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+
+/* jhash - hash an arbitrary key
+ * @k: sequence of bytes as key
+ * @length: the length of the key
+ * @initval: the previous hash, or an arbitray value
+ *
+ * The generic version, hashes an arbitrary sequence of bytes.
+ * No alignment or length assumptions are made about the input key.
+ *
+ * Returns the hash value of the key. The result depends on endianness.
+ */
+u32 jhash(const void *key, u32 length, u32 initval)
+{
+ u32 a, b, c;
+ const u8 *k = key;
+
+ /* Set up the internal state */
+ a = b = c = JHASH_INITVAL + length + initval;
+
+ /* All but the last block: affect some 32 bits of (a,b,c) */
+ while (length > 12) {
+ a += __get_unaligned_cpu32(k);
+ b += __get_unaligned_cpu32(k + 4);
+ c += __get_unaligned_cpu32(k + 8);
+ __jhash_mix(a, b, c);
+ length -= 12;
+ k += 12;
+ }
+ /* Last block: affect all 32 bits of (c) */
+ /* All the case statements fall through */
+ switch (length) {
+ case 12: c += (u32)k[11]<<24;
+ case 11: c += (u32)k[10]<<16;
+ case 10: c += (u32)k[9]<<8;
+ case 9: c += k[8];
+ case 8: b += (u32)k[7]<<24;
+ case 7: b += (u32)k[6]<<16;
+ case 6: b += (u32)k[5]<<8;
+ case 5: b += k[4];
+ case 4: a += (u32)k[3]<<24;
+ case 3: a += (u32)k[2]<<16;
+ case 2: a += (u32)k[1]<<8;
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break;
+ }
+
+ return c;
+}
+EXPORT_SYMBOL_GPL(jhash);
+
+/* jhash2 - hash an array of u32's
+ * @k: the key which must be an array of u32's
+ * @length: the number of u32's in the key
+ * @initval: the previous hash, or an arbitray value
+ *
+ * Returns the hash value of the key.
+ */
+u32 jhash2(const u32 *k, u32 length, u32 initval)
+{
+ u32 a, b, c;
+
+ /* Set up the internal state */
+ a = b = c = JHASH_INITVAL + (length<<2) + initval;
+
+ /* Handle most of the key */
+ while (length > 3) {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ __jhash_mix(a, b, c);
+ length -= 3;
+ k += 3;
+ }
+
+ /* Handle the last 3 u32's: all the case statements fall through */
+ switch (length) {
+ case 3: c += k[2];
+ case 2: b += k[1];
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break;
+ }
+
+ return c;
+}
+EXPORT_SYMBOL_GPL(jhash2);
+
+/* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
+u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{
+ a += initval;
+ b += initval;
+ c += initval;
+
+ __jhash_final(a, b, c);
+
+ return c;
+}
+EXPORT_SYMBOL_GPL(__jhash_nwords);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index cc0c697..cf429c5 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -663,11 +663,6 @@ static size_t rounded_hashtable_size(const struct rhashtable_params *params)
(unsigned long)params->min_size);
}

-static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
-{
- return jhash2(key, length, seed);
-}
-
/**
* rhashtable_init - initialize a new hash table
* @ht: hash table to be initialized
@@ -773,8 +768,14 @@ int rhashtable_init(struct rhashtable *ht,
ht->p.hashfn = jhash;

if (!(ht->key_len & (sizeof(u32) - 1))) {
+ typedef u32 (*hashfunc)(const void *k, u32 length, u32 initval);
+
ht->key_len /= sizeof(u32);
- ht->p.hashfn = rhashtable_jhash2;
+ /*
+ * jhash2() 1st param is const u32*,
+ * p.hashfn wants const void*. Hence the cast:
+ */
+ ht->p.hashfn = (hashfunc)jhash2;
}
}

--
1.8.1.4


2015-07-16 14:26:27

by Alexander Duyck

[permalink] [raw]
Subject: Re: [PATCH v2] jhash: Deinline jhash, jhash2 and __jhash_nwords

On 07/16/2015 05:40 AM, Denys Vlasenko wrote:
> This patch deinlines jhash, jhash2 and __jhash_nwords.
>
> It also removes rhashtable_jhash2(key, length, seed)
> because it was merely calling jhash2(key, length, seed).
>
> With this .config: http://busybox.net/~vda/kernel_config,
> after deinlining these functions have sizes and callsite counts
> as follows:
>
> __jhash_nwords: 72 bytes, 75 calls
> jhash: 297 bytes, 111 calls
> jhash2: 205 bytes, 136 calls
>
> Total size decrease is about 38,000 bytes:
>
> text data bss dec hex filename
> 90663567 17221960 36659200 144544727 89d93d7 vmlinux5
> 90625577 17221864 36659200 144506641 89cff11 vmlinux.after
>
> Signed-off-by: Denys Vlasenko <[email protected]>
> CC: Thomas Graf <[email protected]>
> CC: Alexander Duyck <[email protected]>
> CC: Jozsef Kadlecsik <[email protected]>
> CC: Herbert Xu <[email protected]>
> CC: [email protected]
> CC: [email protected]
> ---
> Changes in v2: created a new source file, jhash.c
>
> include/linux/jhash.h | 123 +----------------------------------------
> lib/Makefile | 2 +-
> lib/jhash.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++
> lib/rhashtable.c | 13 +++--
> 4 files changed, 160 insertions(+), 127 deletions(-)
> create mode 100644 lib/jhash.c
>
> diff --git a/include/linux/jhash.h b/include/linux/jhash.h
> index 348c6f4..0b3f55d 100644
> --- a/include/linux/jhash.h
> +++ b/include/linux/jhash.h
> @@ -31,131 +31,14 @@
> /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */
> #define jhash_mask(n) (jhash_size(n)-1)
>
> -/* __jhash_mix -- mix 3 32-bit values reversibly. */
> -#define __jhash_mix(a, b, c) \
> -{ \
> - a -= c; a ^= rol32(c, 4); c += b; \
> - b -= a; b ^= rol32(a, 6); a += c; \
> - c -= b; c ^= rol32(b, 8); b += a; \
> - a -= c; a ^= rol32(c, 16); c += b; \
> - b -= a; b ^= rol32(a, 19); a += c; \
> - c -= b; c ^= rol32(b, 4); b += a; \
> -}
> -
> -/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
> -#define __jhash_final(a, b, c) \
> -{ \
> - c ^= b; c -= rol32(b, 14); \
> - a ^= c; a -= rol32(c, 11); \
> - b ^= a; b -= rol32(a, 25); \
> - c ^= b; c -= rol32(b, 16); \
> - a ^= c; a -= rol32(c, 4); \
> - b ^= a; b -= rol32(a, 14); \
> - c ^= b; c -= rol32(b, 24); \
> -}
> -
> /* An arbitrary initial parameter */
> #define JHASH_INITVAL 0xdeadbeef
>
> -/* jhash - hash an arbitrary key
> - * @k: sequence of bytes as key
> - * @length: the length of the key
> - * @initval: the previous hash, or an arbitray value
> - *
> - * The generic version, hashes an arbitrary sequence of bytes.
> - * No alignment or length assumptions are made about the input key.
> - *
> - * Returns the hash value of the key. The result depends on endianness.
> - */
> -static inline u32 jhash(const void *key, u32 length, u32 initval)
> -{
> - u32 a, b, c;
> - const u8 *k = key;
> -
> - /* Set up the internal state */
> - a = b = c = JHASH_INITVAL + length + initval;
> -
> - /* All but the last block: affect some 32 bits of (a,b,c) */
> - while (length > 12) {
> - a += __get_unaligned_cpu32(k);
> - b += __get_unaligned_cpu32(k + 4);
> - c += __get_unaligned_cpu32(k + 8);
> - __jhash_mix(a, b, c);
> - length -= 12;
> - k += 12;
> - }
> - /* Last block: affect all 32 bits of (c) */
> - /* All the case statements fall through */
> - switch (length) {
> - case 12: c += (u32)k[11]<<24;
> - case 11: c += (u32)k[10]<<16;
> - case 10: c += (u32)k[9]<<8;
> - case 9: c += k[8];
> - case 8: b += (u32)k[7]<<24;
> - case 7: b += (u32)k[6]<<16;
> - case 6: b += (u32)k[5]<<8;
> - case 5: b += k[4];
> - case 4: a += (u32)k[3]<<24;
> - case 3: a += (u32)k[2]<<16;
> - case 2: a += (u32)k[1]<<8;
> - case 1: a += k[0];
> - __jhash_final(a, b, c);
> - case 0: /* Nothing left to add */
> - break;
> - }
> -
> - return c;
> -}
> -
> -/* jhash2 - hash an array of u32's
> - * @k: the key which must be an array of u32's
> - * @length: the number of u32's in the key
> - * @initval: the previous hash, or an arbitray value
> - *
> - * Returns the hash value of the key.
> - */
> -static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
> -{
> - u32 a, b, c;
> -
> - /* Set up the internal state */
> - a = b = c = JHASH_INITVAL + (length<<2) + initval;
> -
> - /* Handle most of the key */
> - while (length > 3) {
> - a += k[0];
> - b += k[1];
> - c += k[2];
> - __jhash_mix(a, b, c);
> - length -= 3;
> - k += 3;
> - }
> -
> - /* Handle the last 3 u32's: all the case statements fall through */
> - switch (length) {
> - case 3: c += k[2];
> - case 2: b += k[1];
> - case 1: a += k[0];
> - __jhash_final(a, b, c);
> - case 0: /* Nothing left to add */
> - break;
> - }
> -
> - return c;
> -}
> -
> +u32 jhash(const void *key, u32 length, u32 initval);
> +u32 jhash2(const u32 *k, u32 length, u32 initval);
>
> /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
> -static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
> -{
> - a += initval;
> - b += initval;
> - c += initval;
> -
> - __jhash_final(a, b, c);
> -
> - return c;
> -}
> +u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval);
>
> static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
> {
> diff --git a/lib/Makefile b/lib/Makefile
> index 6897b52..978be53 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
> bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
> gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
> bsearch.o find_bit.o llist.o memweight.o kfifo.o \
> - percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
> + percpu-refcount.o percpu_ida.o jhash.o rhashtable.o reciprocal_div.o
> obj-y += string_helpers.o
> obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
> obj-y += hexdump.o
> diff --git a/lib/jhash.c b/lib/jhash.c
> new file mode 100644
> index 0000000..cf3c277
> --- /dev/null
> +++ b/lib/jhash.c
> @@ -0,0 +1,149 @@
> +/* Jenkins hash support.
> + *
> + * Copyright (C) 2006. Bob Jenkins ([email protected])
> + *
> + * http://burtleburtle.net/bob/hash/
> + *
> + * These are the credits from Bob's sources:
> + *
> + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> + *
> + * These are functions for producing 32-bit hashes for hash table lookup.
> + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> + * are externally useful functions. Routines to test the hash are included
> + * if SELF_TEST is defined. You can use this free for any purpose. It's in
> + * the public domain. It has no warranty.
> + *
> + * Copyright (C) 2009-2010 Jozsef Kadlecsik ([email protected])
> + *
> + * I've modified Bob's hash to be useful in the Linux kernel, and
> + * any bugs present are my fault.
> + * Jozsef
> + */
> +#include <linux/export.h>
> +#include <linux/jhash.h>
> +
> +/* __jhash_mix -- mix 3 32-bit values reversibly. */
> +#define __jhash_mix(a, b, c) \
> +{ \
> + a -= c; a ^= rol32(c, 4); c += b; \
> + b -= a; b ^= rol32(a, 6); a += c; \
> + c -= b; c ^= rol32(b, 8); b += a; \
> + a -= c; a ^= rol32(c, 16); c += b; \
> + b -= a; b ^= rol32(a, 19); a += c; \
> + c -= b; c ^= rol32(b, 4); b += a; \
> +}
> +
> +/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
> +#define __jhash_final(a, b, c) \
> +{ \
> + c ^= b; c -= rol32(b, 14); \
> + a ^= c; a -= rol32(c, 11); \
> + b ^= a; b -= rol32(a, 25); \
> + c ^= b; c -= rol32(b, 16); \
> + a ^= c; a -= rol32(c, 4); \
> + b ^= a; b -= rol32(a, 14); \
> + c ^= b; c -= rol32(b, 24); \
> +}
> +
> +/* jhash - hash an arbitrary key
> + * @k: sequence of bytes as key
> + * @length: the length of the key
> + * @initval: the previous hash, or an arbitray value
> + *
> + * The generic version, hashes an arbitrary sequence of bytes.
> + * No alignment or length assumptions are made about the input key.
> + *
> + * Returns the hash value of the key. The result depends on endianness.
> + */
> +u32 jhash(const void *key, u32 length, u32 initval)
> +{
> + u32 a, b, c;
> + const u8 *k = key;
> +
> + /* Set up the internal state */
> + a = b = c = JHASH_INITVAL + length + initval;
> +
> + /* All but the last block: affect some 32 bits of (a,b,c) */
> + while (length > 12) {
> + a += __get_unaligned_cpu32(k);
> + b += __get_unaligned_cpu32(k + 4);
> + c += __get_unaligned_cpu32(k + 8);
> + __jhash_mix(a, b, c);
> + length -= 12;
> + k += 12;
> + }
> + /* Last block: affect all 32 bits of (c) */
> + /* All the case statements fall through */
> + switch (length) {
> + case 12: c += (u32)k[11]<<24;
> + case 11: c += (u32)k[10]<<16;
> + case 10: c += (u32)k[9]<<8;
> + case 9: c += k[8];
> + case 8: b += (u32)k[7]<<24;
> + case 7: b += (u32)k[6]<<16;
> + case 6: b += (u32)k[5]<<8;
> + case 5: b += k[4];
> + case 4: a += (u32)k[3]<<24;
> + case 3: a += (u32)k[2]<<16;
> + case 2: a += (u32)k[1]<<8;
> + case 1: a += k[0];
> + __jhash_final(a, b, c);
> + case 0: /* Nothing left to add */
> + break;
> + }
> +
> + return c;
> +}
> +EXPORT_SYMBOL_GPL(jhash);
> +

I think this should really be EXPORT_SYMBOL, not EXPORT_SYMBOL_GPL. The
fact is the code this is based on is public domain so there isn't much
point in requiring GPL as they could just go grab Bob's original code
and have a hash function put together pretty quick.

> +/* jhash2 - hash an array of u32's
> + * @k: the key which must be an array of u32's
> + * @length: the number of u32's in the key
> + * @initval: the previous hash, or an arbitray value
> + *
> + * Returns the hash value of the key.
> + */
> +u32 jhash2(const u32 *k, u32 length, u32 initval)
> +{
> + u32 a, b, c;
> +
> + /* Set up the internal state */
> + a = b = c = JHASH_INITVAL + (length<<2) + initval;
> +
> + /* Handle most of the key */
> + while (length > 3) {
> + a += k[0];
> + b += k[1];
> + c += k[2];
> + __jhash_mix(a, b, c);
> + length -= 3;
> + k += 3;
> + }
> +
> + /* Handle the last 3 u32's: all the case statements fall through */
> + switch (length) {
> + case 3: c += k[2];
> + case 2: b += k[1];
> + case 1: a += k[0];
> + __jhash_final(a, b, c);
> + case 0: /* Nothing left to add */
> + break;
> + }
> +
> + return c;
> +}
> +EXPORT_SYMBOL_GPL(jhash2);
> +

Same thing here.

> +/* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
> +u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
> +{
> + a += initval;
> + b += initval;
> + c += initval;
> +
> + __jhash_final(a, b, c);
> +
> + return c;
> +}
> +EXPORT_SYMBOL_GPL(__jhash_nwords);

This one too.

2015-07-16 15:43:27

by Tom Herbert

[permalink] [raw]
Subject: Re: [PATCH v2] jhash: Deinline jhash, jhash2 and __jhash_nwords

On Thu, Jul 16, 2015 at 5:40 AM, Denys Vlasenko <[email protected]> wrote:
> This patch deinlines jhash, jhash2 and __jhash_nwords.
>
> It also removes rhashtable_jhash2(key, length, seed)
> because it was merely calling jhash2(key, length, seed).
>
> With this .config: http://busybox.net/~vda/kernel_config,
> after deinlining these functions have sizes and callsite counts
> as follows:
>
> __jhash_nwords: 72 bytes, 75 calls
> jhash: 297 bytes, 111 calls
> jhash2: 205 bytes, 136 calls
>
jhash is used in several places in the critical data path. Does the
decrease in text size justify performance impact of not inlining it?

Tom

> Total size decrease is about 38,000 bytes:
>
> text data bss dec hex filename
> 90663567 17221960 36659200 144544727 89d93d7 vmlinux5
> 90625577 17221864 36659200 144506641 89cff11 vmlinux.after
>
> Signed-off-by: Denys Vlasenko <[email protected]>
> CC: Thomas Graf <[email protected]>
> CC: Alexander Duyck <[email protected]>
> CC: Jozsef Kadlecsik <[email protected]>
> CC: Herbert Xu <[email protected]>
> CC: [email protected]
> CC: [email protected]
> ---
> Changes in v2: created a new source file, jhash.c
>
> include/linux/jhash.h | 123 +----------------------------------------
> lib/Makefile | 2 +-
> lib/jhash.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++
> lib/rhashtable.c | 13 +++--
> 4 files changed, 160 insertions(+), 127 deletions(-)
> create mode 100644 lib/jhash.c
>
> diff --git a/include/linux/jhash.h b/include/linux/jhash.h
> index 348c6f4..0b3f55d 100644
> --- a/include/linux/jhash.h
> +++ b/include/linux/jhash.h
> @@ -31,131 +31,14 @@
> /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */
> #define jhash_mask(n) (jhash_size(n)-1)
>
> -/* __jhash_mix -- mix 3 32-bit values reversibly. */
> -#define __jhash_mix(a, b, c) \
> -{ \
> - a -= c; a ^= rol32(c, 4); c += b; \
> - b -= a; b ^= rol32(a, 6); a += c; \
> - c -= b; c ^= rol32(b, 8); b += a; \
> - a -= c; a ^= rol32(c, 16); c += b; \
> - b -= a; b ^= rol32(a, 19); a += c; \
> - c -= b; c ^= rol32(b, 4); b += a; \
> -}
> -
> -/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
> -#define __jhash_final(a, b, c) \
> -{ \
> - c ^= b; c -= rol32(b, 14); \
> - a ^= c; a -= rol32(c, 11); \
> - b ^= a; b -= rol32(a, 25); \
> - c ^= b; c -= rol32(b, 16); \
> - a ^= c; a -= rol32(c, 4); \
> - b ^= a; b -= rol32(a, 14); \
> - c ^= b; c -= rol32(b, 24); \
> -}
> -
> /* An arbitrary initial parameter */
> #define JHASH_INITVAL 0xdeadbeef
>
> -/* jhash - hash an arbitrary key
> - * @k: sequence of bytes as key
> - * @length: the length of the key
> - * @initval: the previous hash, or an arbitray value
> - *
> - * The generic version, hashes an arbitrary sequence of bytes.
> - * No alignment or length assumptions are made about the input key.
> - *
> - * Returns the hash value of the key. The result depends on endianness.
> - */
> -static inline u32 jhash(const void *key, u32 length, u32 initval)
> -{
> - u32 a, b, c;
> - const u8 *k = key;
> -
> - /* Set up the internal state */
> - a = b = c = JHASH_INITVAL + length + initval;
> -
> - /* All but the last block: affect some 32 bits of (a,b,c) */
> - while (length > 12) {
> - a += __get_unaligned_cpu32(k);
> - b += __get_unaligned_cpu32(k + 4);
> - c += __get_unaligned_cpu32(k + 8);
> - __jhash_mix(a, b, c);
> - length -= 12;
> - k += 12;
> - }
> - /* Last block: affect all 32 bits of (c) */
> - /* All the case statements fall through */
> - switch (length) {
> - case 12: c += (u32)k[11]<<24;
> - case 11: c += (u32)k[10]<<16;
> - case 10: c += (u32)k[9]<<8;
> - case 9: c += k[8];
> - case 8: b += (u32)k[7]<<24;
> - case 7: b += (u32)k[6]<<16;
> - case 6: b += (u32)k[5]<<8;
> - case 5: b += k[4];
> - case 4: a += (u32)k[3]<<24;
> - case 3: a += (u32)k[2]<<16;
> - case 2: a += (u32)k[1]<<8;
> - case 1: a += k[0];
> - __jhash_final(a, b, c);
> - case 0: /* Nothing left to add */
> - break;
> - }
> -
> - return c;
> -}
> -
> -/* jhash2 - hash an array of u32's
> - * @k: the key which must be an array of u32's
> - * @length: the number of u32's in the key
> - * @initval: the previous hash, or an arbitray value
> - *
> - * Returns the hash value of the key.
> - */
> -static inline u32 jhash2(const u32 *k, u32 length, u32 initval)
> -{
> - u32 a, b, c;
> -
> - /* Set up the internal state */
> - a = b = c = JHASH_INITVAL + (length<<2) + initval;
> -
> - /* Handle most of the key */
> - while (length > 3) {
> - a += k[0];
> - b += k[1];
> - c += k[2];
> - __jhash_mix(a, b, c);
> - length -= 3;
> - k += 3;
> - }
> -
> - /* Handle the last 3 u32's: all the case statements fall through */
> - switch (length) {
> - case 3: c += k[2];
> - case 2: b += k[1];
> - case 1: a += k[0];
> - __jhash_final(a, b, c);
> - case 0: /* Nothing left to add */
> - break;
> - }
> -
> - return c;
> -}
> -
> +u32 jhash(const void *key, u32 length, u32 initval);
> +u32 jhash2(const u32 *k, u32 length, u32 initval);
>
> /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
> -static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
> -{
> - a += initval;
> - b += initval;
> - c += initval;
> -
> - __jhash_final(a, b, c);
> -
> - return c;
> -}
> +u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval);
>
> static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
> {
> diff --git a/lib/Makefile b/lib/Makefile
> index 6897b52..978be53 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -26,7 +26,7 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
> bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
> gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
> bsearch.o find_bit.o llist.o memweight.o kfifo.o \
> - percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
> + percpu-refcount.o percpu_ida.o jhash.o rhashtable.o reciprocal_div.o
> obj-y += string_helpers.o
> obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
> obj-y += hexdump.o
> diff --git a/lib/jhash.c b/lib/jhash.c
> new file mode 100644
> index 0000000..cf3c277
> --- /dev/null
> +++ b/lib/jhash.c
> @@ -0,0 +1,149 @@
> +/* Jenkins hash support.
> + *
> + * Copyright (C) 2006. Bob Jenkins ([email protected])
> + *
> + * http://burtleburtle.net/bob/hash/
> + *
> + * These are the credits from Bob's sources:
> + *
> + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> + *
> + * These are functions for producing 32-bit hashes for hash table lookup.
> + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> + * are externally useful functions. Routines to test the hash are included
> + * if SELF_TEST is defined. You can use this free for any purpose. It's in
> + * the public domain. It has no warranty.
> + *
> + * Copyright (C) 2009-2010 Jozsef Kadlecsik ([email protected])
> + *
> + * I've modified Bob's hash to be useful in the Linux kernel, and
> + * any bugs present are my fault.
> + * Jozsef
> + */
> +#include <linux/export.h>
> +#include <linux/jhash.h>
> +
> +/* __jhash_mix -- mix 3 32-bit values reversibly. */
> +#define __jhash_mix(a, b, c) \
> +{ \
> + a -= c; a ^= rol32(c, 4); c += b; \
> + b -= a; b ^= rol32(a, 6); a += c; \
> + c -= b; c ^= rol32(b, 8); b += a; \
> + a -= c; a ^= rol32(c, 16); c += b; \
> + b -= a; b ^= rol32(a, 19); a += c; \
> + c -= b; c ^= rol32(b, 4); b += a; \
> +}
> +
> +/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
> +#define __jhash_final(a, b, c) \
> +{ \
> + c ^= b; c -= rol32(b, 14); \
> + a ^= c; a -= rol32(c, 11); \
> + b ^= a; b -= rol32(a, 25); \
> + c ^= b; c -= rol32(b, 16); \
> + a ^= c; a -= rol32(c, 4); \
> + b ^= a; b -= rol32(a, 14); \
> + c ^= b; c -= rol32(b, 24); \
> +}
> +
> +/* jhash - hash an arbitrary key
> + * @k: sequence of bytes as key
> + * @length: the length of the key
> + * @initval: the previous hash, or an arbitray value
> + *
> + * The generic version, hashes an arbitrary sequence of bytes.
> + * No alignment or length assumptions are made about the input key.
> + *
> + * Returns the hash value of the key. The result depends on endianness.
> + */
> +u32 jhash(const void *key, u32 length, u32 initval)
> +{
> + u32 a, b, c;
> + const u8 *k = key;
> +
> + /* Set up the internal state */
> + a = b = c = JHASH_INITVAL + length + initval;
> +
> + /* All but the last block: affect some 32 bits of (a,b,c) */
> + while (length > 12) {
> + a += __get_unaligned_cpu32(k);
> + b += __get_unaligned_cpu32(k + 4);
> + c += __get_unaligned_cpu32(k + 8);
> + __jhash_mix(a, b, c);
> + length -= 12;
> + k += 12;
> + }
> + /* Last block: affect all 32 bits of (c) */
> + /* All the case statements fall through */
> + switch (length) {
> + case 12: c += (u32)k[11]<<24;
> + case 11: c += (u32)k[10]<<16;
> + case 10: c += (u32)k[9]<<8;
> + case 9: c += k[8];
> + case 8: b += (u32)k[7]<<24;
> + case 7: b += (u32)k[6]<<16;
> + case 6: b += (u32)k[5]<<8;
> + case 5: b += k[4];
> + case 4: a += (u32)k[3]<<24;
> + case 3: a += (u32)k[2]<<16;
> + case 2: a += (u32)k[1]<<8;
> + case 1: a += k[0];
> + __jhash_final(a, b, c);
> + case 0: /* Nothing left to add */
> + break;
> + }
> +
> + return c;
> +}
> +EXPORT_SYMBOL_GPL(jhash);
> +
> +/* jhash2 - hash an array of u32's
> + * @k: the key which must be an array of u32's
> + * @length: the number of u32's in the key
> + * @initval: the previous hash, or an arbitray value
> + *
> + * Returns the hash value of the key.
> + */
> +u32 jhash2(const u32 *k, u32 length, u32 initval)
> +{
> + u32 a, b, c;
> +
> + /* Set up the internal state */
> + a = b = c = JHASH_INITVAL + (length<<2) + initval;
> +
> + /* Handle most of the key */
> + while (length > 3) {
> + a += k[0];
> + b += k[1];
> + c += k[2];
> + __jhash_mix(a, b, c);
> + length -= 3;
> + k += 3;
> + }
> +
> + /* Handle the last 3 u32's: all the case statements fall through */
> + switch (length) {
> + case 3: c += k[2];
> + case 2: b += k[1];
> + case 1: a += k[0];
> + __jhash_final(a, b, c);
> + case 0: /* Nothing left to add */
> + break;
> + }
> +
> + return c;
> +}
> +EXPORT_SYMBOL_GPL(jhash2);
> +
> +/* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
> +u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
> +{
> + a += initval;
> + b += initval;
> + c += initval;
> +
> + __jhash_final(a, b, c);
> +
> + return c;
> +}
> +EXPORT_SYMBOL_GPL(__jhash_nwords);
> diff --git a/lib/rhashtable.c b/lib/rhashtable.c
> index cc0c697..cf429c5 100644
> --- a/lib/rhashtable.c
> +++ b/lib/rhashtable.c
> @@ -663,11 +663,6 @@ static size_t rounded_hashtable_size(const struct rhashtable_params *params)
> (unsigned long)params->min_size);
> }
>
> -static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
> -{
> - return jhash2(key, length, seed);
> -}
> -
> /**
> * rhashtable_init - initialize a new hash table
> * @ht: hash table to be initialized
> @@ -773,8 +768,14 @@ int rhashtable_init(struct rhashtable *ht,
> ht->p.hashfn = jhash;
>
> if (!(ht->key_len & (sizeof(u32) - 1))) {
> + typedef u32 (*hashfunc)(const void *k, u32 length, u32 initval);
> +
> ht->key_len /= sizeof(u32);
> - ht->p.hashfn = rhashtable_jhash2;
> + /*
> + * jhash2() 1st param is const u32*,
> + * p.hashfn wants const void*. Hence the cast:
> + */
> + ht->p.hashfn = (hashfunc)jhash2;
> }
> }
>
> --
> 1.8.1.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html

2015-07-16 18:17:32

by David Miller

[permalink] [raw]
Subject: Re: [PATCH v2] jhash: Deinline jhash, jhash2 and __jhash_nwords

From: Tom Herbert <[email protected]>
Date: Thu, 16 Jul 2015 08:43:25 -0700

> On Thu, Jul 16, 2015 at 5:40 AM, Denys Vlasenko <[email protected]> wrote:
>> This patch deinlines jhash, jhash2 and __jhash_nwords.
>>
>> It also removes rhashtable_jhash2(key, length, seed)
>> because it was merely calling jhash2(key, length, seed).
>>
>> With this .config: http://busybox.net/~vda/kernel_config,
>> after deinlining these functions have sizes and callsite counts
>> as follows:
>>
>> __jhash_nwords: 72 bytes, 75 calls
>> jhash: 297 bytes, 111 calls
>> jhash2: 205 bytes, 136 calls
>>
> jhash is used in several places in the critical data path. Does the
> decrease in text size justify performance impact of not inlining it?

Tom took the words right out of my mouth.

Denys, you keep making deinlining changes like this all the time, like
a robot. But I never see you make any effort to look into the performance
nor code generation ramifications of your changes.

And frankly that makes your patches quite tiring to deal with.

Your changes potentially have large performance implications, yet you
don't put any effort into considering that aspect at all.

2015-07-16 19:23:37

by Joe Perches

[permalink] [raw]
Subject: Re: [PATCH v2] jhash: Deinline jhash, jhash2 and __jhash_nwords

On Thu, 2015-07-16 at 11:17 -0700, David Miller wrote:
> From: Tom Herbert <[email protected]>
> Date: Thu, 16 Jul 2015 08:43:25 -0700
>
> > On Thu, Jul 16, 2015 at 5:40 AM, Denys Vlasenko <[email protected]> wrote:
> >> This patch deinlines jhash, jhash2 and __jhash_nwords.
> >>
> >> It also removes rhashtable_jhash2(key, length, seed)
> >> because it was merely calling jhash2(key, length, seed).
> >>
> >> With this .config: http://busybox.net/~vda/kernel_config,
> >> after deinlining these functions have sizes and callsite counts
> >> as follows:
> >>
> >> __jhash_nwords: 72 bytes, 75 calls
> >> jhash: 297 bytes, 111 calls
> >> jhash2: 205 bytes, 136 calls
> >>
> > jhash is used in several places in the critical data path. Does the
> > decrease in text size justify performance impact of not inlining it?
>
> Tom took the words right out of my mouth.
>
> Denys, you keep making deinlining changes like this all the time, like
> a robot. But I never see you make any effort to look into the performance
> nor code generation ramifications of your changes.
>
> And frankly that makes your patches quite tiring to deal with.
>
> Your changes potentially have large performance implications, yet you
> don't put any effort into considering that aspect at all.

It might be useful to have these performance impacting
changes guarded by something like CONFIG_CC_OPTIMIZE_FOR_SIZE
with another static __always_inline __<func> and a function &
EXPORT_SYMBOL or just a static inline so that where code size
is critical it's uninlined.

Though even for tiny embedded uses, the additional code
complexity might not be worth it.

2015-07-17 13:44:51

by Hagen Paul Pfeifer

[permalink] [raw]
Subject: Re: [PATCH v2] jhash: Deinline jhash, jhash2 and __jhash_nwords

> On July 16, 2015 at 9:23 PM Joe Perches <[email protected]> wrote:
>
> It might be useful to have these performance impacting
> changes guarded by something like CONFIG_CC_OPTIMIZE_FOR_SIZE
> with another static __always_inline __<func> and a function &
> EXPORT_SYMBOL or just a static inline so that where code size
> is critical it's uninlined.

But keep in mind that jhash, jhash2 and __jhash_nwords are *not*
one-instruction long functions. We duplicate code over and over resulting
probably in more cache misses. __always_inline__ is probably too strict
and a vanilla inline is already for 99% of all distribution builds a
__always_inline__, see ARCH_SUPPORTS_OPTIMIZED_INLINING and
CONFIG_CC_OPTIMIZE_FOR_SIZE.

The answer depends on the specific workload. Sometimes an enforced inline
perform better and sometimes a call is the better solution (read: less
cache misses). General purpose vendors with a larger working set size
should reduce cache misses by deinline many functions. For
high-performance special fast-path operations a strong inlined kernel
build is probably faster. __always_inline__ makes it impossible for the
user to deinline functions or not.

Hagen

2015-07-17 22:54:06

by Joe Perches

[permalink] [raw]
Subject: Re: [PATCH v2] jhash: Deinline jhash, jhash2 and __jhash_nwords

On Fri, 2015-07-17 at 15:44 +0200, Hagen Paul Pfeifer wrote:
> > On July 16, 2015 at 9:23 PM Joe Perches <[email protected]> wrote:
> >
> > It might be useful to have these performance impacting
> > changes guarded by something like CONFIG_CC_OPTIMIZE_FOR_SIZE
> > with another static __always_inline __<func> and a function &
> > EXPORT_SYMBOL or just a static inline so that where code size
> > is critical it's uninlined.
>
> But keep in mind that jhash, jhash2 and __jhash_nwords are *not*
> one-instruction long functions. We duplicate code over and over resulting
> probably in more cache misses. __always_inline__ is probably too strict
> and a vanilla inline is already for 99% of all distribution builds a
> __always_inline__, see ARCH_SUPPORTS_OPTIMIZED_INLINING and
> CONFIG_CC_OPTIMIZE_FOR_SIZE.

Hello.

Perhaps I wasn't clear/explicit enough.

I tried to suggest using a single __always_inline like this:

Two #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE (or some other
CONFIG_FOO) functions, one in a .h for the existing behavior
and another in a .c for more code space compact uses.

jhash.h

static __always_inline u32 __jhash(const void *key, u32 length, u32 initval)
{
[current jhash implementation...]
}

#ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
u32 jhash(const void *key, u32 length, u32 initval);
#else
static inline u32 jhash(const void *key, u32 length, u32 initval)
{
return __jhash(key, length, initval);
}
#endif

jhash.c:

#ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
u32 jhash(const void *key, u32 length, u32 initval)
{
return __jhash(key, length, initval);
}
EXPORT_SYMBOL(jhash)
#endif

etc...

Perhaps the additional code complexity is not worth it.

2015-07-19 15:15:00

by Denys Vlasenko

[permalink] [raw]
Subject: Re: [PATCH v2] jhash: Deinline jhash, jhash2 and __jhash_nwords

On 07/16/2015 08:17 PM, David Miller wrote:
> From: Tom Herbert <[email protected]>
> Date: Thu, 16 Jul 2015 08:43:25 -0700
>
>> On Thu, Jul 16, 2015 at 5:40 AM, Denys Vlasenko <[email protected]> wrote:
>>> This patch deinlines jhash, jhash2 and __jhash_nwords.
>>>
>>> It also removes rhashtable_jhash2(key, length, seed)
>>> because it was merely calling jhash2(key, length, seed).
>>>
>>> With this .config: http://busybox.net/~vda/kernel_config,
>>> after deinlining these functions have sizes and callsite counts
>>> as follows:
>>>
>>> __jhash_nwords: 72 bytes, 75 calls
>>> jhash: 297 bytes, 111 calls
>>> jhash2: 205 bytes, 136 calls
>>>
>> jhash is used in several places in the critical data path. Does the
>> decrease in text size justify performance impact of not inlining it?
>
> Tom took the words right out of my mouth.
>
> Denys, you keep making deinlining changes like this all the time, like
> a robot. But I never see you make any effort to look into the performance
> nor code generation ramifications of your changes.

The performance impact of the call/ret pair on modern x86
CPUs is only 5 cycles. To make a real difference in execution
speed, the function has to be less than 100 bytes of code.

jhash and jhash2, at more than 200 bytes of code, are definitely
far too large for inlining. Here is the smaller of two, jhash2:

<jhash2>:
8d 84 b2 ef be ad de lea -0x21524111(%rdx,%rsi,4),%eax
55 push %rbp
89 c1 mov %eax,%ecx
48 89 e5 mov %rsp,%rbp
89 c2 mov %eax,%edx
eb 62 jmp ffffffff81a0d204 <jhash2+0x73>
03 47 08 add 0x8(%rdi),%eax
03 17 add (%rdi),%edx
83 ee 03 sub $0x3,%esi
03 4f 04 add 0x4(%rdi),%ecx
48 83 c7 0c add $0xc,%rdi
41 89 c0 mov %eax,%r8d
29 c2 sub %eax,%edx
41 c1 c8 1c ror $0x1c,%r8d
01 c8 add %ecx,%eax
44 31 c2 xor %r8d,%edx
41 89 d0 mov %edx,%r8d
29 d1 sub %edx,%ecx
01 c2 add %eax,%edx
41 c1 c8 1a ror $0x1a,%r8d
41 31 c8 xor %ecx,%r8d
44 89 c1 mov %r8d,%ecx
44 29 c0 sub %r8d,%eax
41 01 d0 add %edx,%r8d
c1 c9 18 ror $0x18,%ecx
31 c1 xor %eax,%ecx
89 c8 mov %ecx,%eax
29 ca sub %ecx,%edx
46 8d 0c 01 lea (%rcx,%r8,1),%r9d
c1 c8 10 ror $0x10,%eax
31 d0 xor %edx,%eax
89 c1 mov %eax,%ecx
41 29 c0 sub %eax,%r8d
42 8d 14 08 lea (%rax,%r9,1),%edx
c1 c9 0d ror $0xd,%ecx
44 31 c1 xor %r8d,%ecx
89 c8 mov %ecx,%eax
41 29 c9 sub %ecx,%r9d
01 d1 add %edx,%ecx
c1 c8 1c ror $0x1c,%eax
44 31 c8 xor %r9d,%eax
83 fe 03 cmp $0x3,%esi
77 99 ja ffffffff81a0d1a2 <jhash2+0x11>
83 fe 02 cmp $0x2,%esi
74 0e je ffffffff81a0d21c <jhash2+0x8b>
83 fe 03 cmp $0x3,%esi
74 06 je ffffffff81a0d219 <jhash2+0x88>
ff ce dec %esi
75 45 jne ffffffff81a0d25c <jhash2+0xcb>
eb 06 jmp ffffffff81a0d21f <jhash2+0x8e>
03 47 08 add 0x8(%rdi),%eax
03 4f 04 add 0x4(%rdi),%ecx
89 ce mov %ecx,%esi
03 17 add (%rdi),%edx
31 c8 xor %ecx,%eax
c1 ce 12 ror $0x12,%esi
29 f0 sub %esi,%eax
89 c6 mov %eax,%esi
31 c2 xor %eax,%edx
c1 ce 15 ror $0x15,%esi
29 f2 sub %esi,%edx
89 d6 mov %edx,%esi
31 d1 xor %edx,%ecx
c1 ce 07 ror $0x7,%esi
29 f1 sub %esi,%ecx
89 ce mov %ecx,%esi
31 c8 xor %ecx,%eax
c1 ce 10 ror $0x10,%esi
29 f0 sub %esi,%eax
89 c6 mov %eax,%esi
31 c2 xor %eax,%edx
c1 ce 1c ror $0x1c,%esi
29 f2 sub %esi,%edx
31 d1 xor %edx,%ecx
c1 ca 12 ror $0x12,%edx
29 d1 sub %edx,%ecx
31 c8 xor %ecx,%eax
c1 c9 08 ror $0x8,%ecx
29 c8 sub %ecx,%eax
5d pop %rbp
c3 retq

Yes, I do think that this much code should not be inlined.

__jhash_nwords is smaller and it is used for hashing 2 or 3 32-bit words,
it may be reasonable to inline it.
I'll send a new patch which keeps it inlined.

> Your changes potentially have large performance implications, yet you
> don't put any effort into considering that aspect at all.

I don't know why you think I'm some sort of zealot hell-bent
on deinlining everything. I'm not. I do listen to other people.

For example:
One of my patches was deninlining net_generic(), you proposed
to drop some BUG_ONs in it instead.
I wasn't insisting to have it "my way".
I sent you a patch which removed BUG_ON but left net_generic()
inlined, and you took the patch.

2015-07-19 18:40:48

by David Miller

[permalink] [raw]
Subject: Re: [PATCH v2] jhash: Deinline jhash, jhash2 and __jhash_nwords

From: Denys Vlasenko <[email protected]>
Date: Sun, 19 Jul 2015 17:14:53 +0200

> On 07/16/2015 08:17 PM, David Miller wrote:
>> From: Tom Herbert <[email protected]>
>> Date: Thu, 16 Jul 2015 08:43:25 -0700
>>
>>> On Thu, Jul 16, 2015 at 5:40 AM, Denys Vlasenko <[email protected]> wrote:
>>>> This patch deinlines jhash, jhash2 and __jhash_nwords.
>>>>
>>>> It also removes rhashtable_jhash2(key, length, seed)
>>>> because it was merely calling jhash2(key, length, seed).
>>>>
>>>> With this .config: http://busybox.net/~vda/kernel_config,
>>>> after deinlining these functions have sizes and callsite counts
>>>> as follows:
>>>>
>>>> __jhash_nwords: 72 bytes, 75 calls
>>>> jhash: 297 bytes, 111 calls
>>>> jhash2: 205 bytes, 136 calls
>>>>
>>> jhash is used in several places in the critical data path. Does the
>>> decrease in text size justify performance impact of not inlining it?
>>
>> Tom took the words right out of my mouth.
>>
>> Denys, you keep making deinlining changes like this all the time, like
>> a robot. But I never see you make any effort to look into the performance
>> nor code generation ramifications of your changes.
>
> The performance impact of the call/ret pair on modern x86
> CPUs is only 5 cycles. To make a real difference in execution
> speed, the function has to be less than 100 bytes of code.

What performance metrics have you collected to assert that deinlining
doesn't matter? What networking tests have you done which stress the
socket demux path?

You still aren't addressing any of my concerns.