2016-05-02 10:26:58

by George Spelvin

Subject: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

This also affects hash_str() and hash_mem() in <linux/sunrpc/svcauth.h>.

After a careful scan through the kernel code, no caller asks any of
those four for more than 32 bits of hash result, except that the
latter two need 64 bits from hash_long() if BITS_PER_LONG == 64.

This is in preparation for the following patch, which will create
a new implementation of hash_64 for the BITS_PER_LONG == 32 case
which is optimized for 32-bit machines.

Signed-off-by: George Spelvin <[email protected]>
Cc: "J. Bruce Fields" <[email protected]>
Cc: Jeff Layton <[email protected]>
Cc: [email protected]
---
Cc: to NFS folks because it touches the sunrpc directory.

Is that "TODO" comment too presumptuous of me?

include/linux/hash.h | 22 ++++++++++++++++------
include/linux/sunrpc/svcauth.h | 15 +++++++--------
2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/include/linux/hash.h b/include/linux/hash.h
index 1afde47e..05003fdc 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -24,15 +24,17 @@

#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32
+#define __hash_long(val) __hash_32(val)
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
+#define __hash_long(val) __hash_64(val)
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64
#else
#error Wordsize not 32 or 64
#endif

-static __always_inline u64 hash_64(u64 val, unsigned int bits)
+static __always_inline u64 __hash_64(u64 val)
{
u64 hash = val;

@@ -55,20 +57,28 @@ static __always_inline u64 hash_64(u64 val, unsigned int bits)
hash += n;
#endif

+ return hash;
+}
+
+static __always_inline u64 hash_64(u64 val, unsigned bits)
+{
/* High bits are more random, so use them. */
- return hash >> (64 - bits);
+ return __hash_64(val) >> (64 - bits);
}

-static inline u32 hash_32(u32 val, unsigned int bits)
+static inline u32 __hash_32(u32 val)
{
/* On some cpus multiply is faster, on others gcc will do shifts */
- u32 hash = val * GOLDEN_RATIO_PRIME_32;
+ return val * GOLDEN_RATIO_PRIME_32;
+}

+static inline u32 hash_32(u32 val, unsigned bits)
+{
/* High bits are more random, so use them. */
- return hash >> (32 - bits);
+ return __hash_32(val) >> (32 - bits);
}

-static inline unsigned long hash_ptr(const void *ptr, unsigned int bits)
+static inline u32 hash_ptr(const void *ptr, unsigned bits)
{
return hash_long((unsigned long)ptr, bits);
}
diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
index c00f53a4..eb1241b3 100644
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -165,7 +165,8 @@ extern int svcauth_unix_set_client(struct svc_rqst *rqstp);
extern int unix_gid_cache_create(struct net *net);
extern void unix_gid_cache_destroy(struct net *net);

-static inline unsigned long hash_str(char *name, int bits)
+/* TODO: Update to <asm/word-at-a-time.h> when CONFIG_DCACHE_WORD_ACCESS */
+static inline u32 hash_str(const char *name, int bits)
{
unsigned long hash = 0;
unsigned long l = 0;
@@ -176,14 +177,13 @@ static inline unsigned long hash_str(char *name, int bits)
c = (char)len; len = -1;
}
l = (l << 8) | c;
- len++;
- if ((len & (BITS_PER_LONG/8-1))==0)
- hash = hash_long(hash^l, BITS_PER_LONG);
+ if (++len % sizeof(hash) == 0)
+ hash = __hash_long(hash^l);
} while (len);
return hash >> (BITS_PER_LONG - bits);
}

-static inline unsigned long hash_mem(char *buf, int length, int bits)
+static inline u32 hash_mem(const char *buf, int length, int bits)
{
unsigned long hash = 0;
unsigned long l = 0;
@@ -195,9 +195,8 @@ static inline unsigned long hash_mem(char *buf, int length, int bits)
} else
c = *buf++;
l = (l << 8) | c;
- len++;
- if ((len & (BITS_PER_LONG/8-1))==0)
- hash = hash_long(hash^l, BITS_PER_LONG);
+ if (++len % sizeof(hash) == 0)
+ hash = __hash_long(hash^l);
} while (len);
return hash >> (BITS_PER_LONG - bits);
}
--
2.8.1



2016-05-02 10:29:29

by George Spelvin

Subject: [PATCH 2/2] <linux/hash.h>: Fix hash_64()'s horrible collision problem

hash_64() was using a low-bit-weight multiplier, which resulted in
very bad mixing of the high bits of the input. In particular,
page-aligned pointers (low 12 bits not used) were a disaster.

Since all 64-bit processors (I checked back to the MIPS R4000 and
Alpha 21064) have hardware multipliers and don't benefit from this
"optimization", use the proper golden ratio value.

Avoid performance problems on 32-bit machines by providing a totally
separate implementation for them based on 32-bit arithmetic.

Keep the bad multiplier for hash_32() for now, at Linus' request.
"Sparse in 32 bits" is not quite as bad as "sparse in 64 bits".

Explicitly document that the algorithm is not stable. I've tried to
check all callers for inadvertent dependence on the exact numerical value,
but some of them are so confusing (*cough* Lustre *cough*) that I can't
tell for sure.

Reported-by: Thomas Gleixner <[email protected]>
Signed-off-by: George Spelvin <[email protected]>
---
include/linux/hash.h | 151 ++++++++++++++++++++++++++++++++++++---------------
1 file changed, 107 insertions(+), 44 deletions(-)

diff --git a/include/linux/hash.h b/include/linux/hash.h
index 05003fdc..64c44e20 100644
--- a/include/linux/hash.h
+++ b/include/linux/hash.h
@@ -1,63 +1,56 @@
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
-/* Fast hashing routine for ints, longs and pointers.
- (C) 2002 Nadia Yvette Chambers, IBM */
-
/*
- * Knuth recommends primes in approximately golden ratio to the maximum
- * integer representable by a machine word for multiplicative hashing.
+ * Fast hashing routine for ints, longs and pointers.
+ * (C) 2002 Nadia Yvette Chambers, IBM
+ *
+ * These are used for small in-memory hash tables, where speed is a
+ * primary concern. If you want something a little bit stronger, see
+ * <linux/jhash.h>, especially functions like jhash_3words(). If your
+ * hash table is subject to a hash collision denial of service attack,
+ * use something cryptographic.
+ *
+ * Note that the algorithms used are not guaranteed stable across kernel
+ * versions or architectures! In particular, hash_64() is implemented
+ * differently on 32- and 64-bit machines. Do not let external behavior
+ * depend on the hash values.
+ *
+ * The algorithm used is straight from Knuth: multiply a w-bit word by
+ * a suitable large constant, and take the high bits of the w-bit result.
+ *
* Chuck Lever verified the effectiveness of this technique:
* http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
*
- * These primes are chosen to be bit-sparse, that is operations on
- * them can use shifts and additions instead of multiplications for
- * machines where multiplications are slow.
+ * A good reference is Mikkel Thorup, "High Speed Hashing for
+ * Integers and Strings" at http://arxiv.org/abs/1504.06804 and
+ * https://www.youtube.com/watch?v=cB85UZKJQTc
+ *
+ * Because the current algorithm is linear (hash(a+b) = hash(a) + hash(b)),
+ * adding or subtracting hash values is just as likely to cause collisions
+ * as adding or subtracting the keys themselves.
*/
-
#include <asm/types.h>
#include <linux/compiler.h>

-/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
-#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
-/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
-#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL
+/*
+ * Although a random odd number will do, it turns out that the golden ratio
+ * phi = (sqrt(5)-1)/2, or its negative, has particularly nice properties.
+ *
+ * These are actually the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2.
+ * (See Knuth vol 3, section 6.4, exercise 9.)
+ */
+#define GOLDEN_RATIO_32 0x61C88647
+#define GOLDEN_RATIO_64 0x61C8864680B583EBull

-#if BITS_PER_LONG == 32
-#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32
-#define __hash_long(val) __hash_32(val)
-#define hash_long(val, bits) hash_32(val, bits)
-#elif BITS_PER_LONG == 64
+#if BITS_PER_LONG == 64
+
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64 /* Used in fs/inode.c */
#define __hash_long(val) __hash_64(val)
#define hash_long(val, bits) hash_64(val, bits)
-#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64
-#else
-#error Wordsize not 32 or 64
-#endif

static __always_inline u64 __hash_64(u64 val)
{
- u64 hash = val;
-
-#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
- hash = hash * GOLDEN_RATIO_PRIME_64;
-#else
- /* Sigh, gcc can't optimise this alone like it does for 32 bits. */
- u64 n = hash;
- n <<= 18;
- hash -= n;
- n <<= 33;
- hash -= n;
- n <<= 3;
- hash += n;
- n <<= 3;
- hash -= n;
- n <<= 4;
- hash += n;
- n <<= 2;
- hash += n;
-#endif
-
- return hash;
+ return val * GOLDEN_RATIO_64;
}

static __always_inline u64 hash_64(u64 val, unsigned bits)
@@ -66,6 +59,75 @@ static __always_inline u64 hash_64(u64 val, unsigned bits)
return __hash_64(val) >> (64 - bits);
}

+#elif BITS_PER_LONG == 32
+
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
+#define __hash_long(val) __hash_32(val)
+#define hash_long(val, bits) hash_32(val, bits)
+
+/*
+ * Because 64-bit multiplications are very expensive on 32-bit machines,
+ * provide a completely separate implementation for them.
+ *
+ * This is mostly used via the hash_long() and hash_ptr() wrappers,
+ * which use hash_32() on 32-bit platforms, but there are some direct
+ * users of hash_64() in 32-bit kernels.
+ *
+ * Note that there is no __hash_64 function at all; that exists
+ * only to implement __hash_long().
+ *
+ * The algorithm is somewhat ad-hoc, but achieves decent mixing.
+ */
+static __always_inline u32 hash_64(u64 val, unsigned bits)
+{
+ u32 hash = (u32)(val >> 32) * GOLDEN_RATIO_32;
+ hash += (u32)val;
+ hash *= GOLDEN_RATIO_32;
+ return hash >> (32 - bits);
+}
+
+#else /* BITS_PER_LONG is something else */
+#error Wordsize not 32 or 64
+#endif
+
+
+/*
+ * This is the old bastard constant: a low-bit-weight
+ * prime close to 2^32 * phi = 0x9E3779B9.
+ *
+ * The purpose of the low bit weight is to make the shift-and-add
+ * code faster on processors like ARMv2 without hardware multiply.
+ * The downside is that the high bits of the input are hashed very weakly.
+ * In particular, the high 16 bits of input are just shifted up and added,
+ * so if you ask for b < 16 bits of output, bits 16..31-b of the input
+ * barely affect the output.
+ *
+ * Annoyingly, GCC compiles this into 6 shifts and adds, which
+ * is enough to multiply by the full GOLDEN_RATIO_32 using a
+ * cleverer algorithm:
+ *
+ * unsigned hash_32(unsigned x)
+ * {
+ * unsigned y, z;
+ *
+ * y = (x << 19) + x;
+ * z = (x << 9) + y;
+ * x = (x << 23) + z;
+ * z = (z << 8) + y;
+ * x = (x << 6) - x;
+ * return (z << 3) + x;
+ * }
+ *
+ * (Found by Yevgen Voronenko's Hcub algorithm, from
+ * http://spiral.ece.cmu.edu/mcm/gen.html)
+ *
+ * Unfortunately, figuring out which version to compile requires
+ * replicating the compiler's logic in Kconfig or the preprocessor.
+ */
+
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
+
static inline u32 __hash_32(u32 val)
{
/* On some cpus multiply is faster, on others gcc will do shifts */
@@ -83,6 +145,7 @@ static inline u32 hash_ptr(const void *ptr, unsigned bits)
return hash_long((unsigned long)ptr, bits);
}

+/* This really should be called "fold32_ptr"; it barely hashes at all. */
static inline u32 hash32_ptr(const void *ptr)
{
unsigned long val = (unsigned long)ptr;
--
2.8.1


2016-05-02 13:28:13

by Peter Zijlstra

Subject: Re: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

On Mon, May 02, 2016 at 06:20:16AM -0400, George Spelvin wrote:
> Subject: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

> +static __always_inline u64 hash_64(u64 val, unsigned bits)
> +{
> /* High bits are more random, so use them. */
> + return __hash_64(val) >> (64 - bits);
> }

Is the subject stale or the above a mistake? Because hash_64() still
very much seems to return u64.

Also, I think I would prefer to keep it like this, I would like to use
it for kernel/locking/lockdep.c:iterate_chain_key(), which currently is
a somewhat crap hash.

Something like:

static inline u64 iterate_chain_key(u64 key1, u64 key2)
{
	return hash_64(key1 ^ key2, 64);
}


2016-05-02 16:24:23

by Linus Torvalds

Subject: Re: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

On Mon, May 2, 2016 at 3:20 AM, George Spelvin <[email protected]> wrote:
>
> After a careful scan through the kernel code, no caller asks any of
> those four for more than 32 bits of hash result, except that the
> latter two need 64 bits from hash_long() if BITS_PER_LONG == 64.

Ugh. I hate this patch.

I really think that we should *not* confuse those two odd svcauth.h
users with the whole hash_32/64 thing.

I think hash_str/hash_mem should be moved to lib/hash.c, and they
just shouldn't use "hash_long()" at all, except at the verty end (they
currently have a very odd way of doing "every <n> bytes _and_ at the
end".

In particular, the hashing in the *middle* is very different from the
hashing at the end.

At the end, you need to make sure the lower bits get spread out
particularly to the upper bits, since you're going to shift things
down.

But in the middle, you just want to spread the bits out (and in
particular, destroy any byte-per-byte patterns that build up in
between).

Quite frankly, I think those functions should just use something like
the FNV hash (or Jenkins hash), and then maybe use "hash_long()" at
the *end* to shift the result down to "bits".
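
For concreteness, a minimal sketch of that shape (the helper name is made
up, not an existing kernel function; the constants are the standard 32-bit
FNV-1a offset basis and prime, and hash_long() is the <linux/hash.h> one):

	static inline u32 fnv1a_str_hash(const char *name, unsigned int bits)
	{
		u32 hash = 0x811c9dc5;			/* FNV-1a 32-bit offset basis */

		while (*name) {
			hash ^= (unsigned char)*name++;	/* mix in one byte */
			hash *= 0x01000193;		/* FNV-1a 32-bit prime */
		}
		/* Only at the very end, spread the bits and narrow to "bits". */
		return hash_long(hash, bits);
	}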

I don't want to make our <linux/hash.h> odder just because of two
entirely broken users.

That said, I actually think hash_str() should be killed entirely.
Better just use what we use for pathnames: full_name_hash() (which
gets a pointer and length) and hash_name (which gets the string).

Those functions do the "word-at-a-time" optimization, and they should
do a good enough job. If they aren't, we should fix them, because they
are a hell of a lot more important than anything that the svcauth code
does.

Linus

2016-05-02 19:08:56

by George Spelvin

Subject: Re: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

Peter Zijlstra wrote:
> Is the subject stale or the above a mistake? Because hash_64() still
> very much seems to return u64.

Damn it no, it's a brown-paper-bag typo caused by a recent rebase.
It's meant to be u32, it was developed with u32, but the typo snuck
in during late testing and I didn't catch it.

I know Linus hates recent rebases, but I actually had a good
reason if you want to hear the saga...

I developed the patch while running v4.4.x. I'd been doing other hacking
on top of v4.5 that resulted in an unstable system, so I kept going back
to the "last known good" v4.4.x kernel to get work done.

Developing this patch, I backed out that buggy work and based it on my
4.5 tree, since Linus hates basing work on random kernels.

Most of it was compile testing, but just before submitting, I of course
had to boot it and test.

When I booted it, I discovered I couldn't load microcode. How the hell
did I cause that? Oh, I have CONFIG_MICROCODE turned off... huh? Oh,
v4.5 has a bug where CONFIG_MICROCODE depended on CONFIG_BLK_DEV_INITRD
which I don't use, and the fix went in to v4.6-rc1.

Okay, fine, in the interest of getting a clean boot for testing, I'll
rebase to v4.6-rc6. See, I told you I had a reason!

Now, I actually have a fair pile of local patches for hacking projects in
progress (I'm running 4.6.0-rc6-0130), so rebasing my whole stack takes
me about an hour and a half, with several merge conflict resolutions.

Finally, I get my clean boot, and everything seems to be working
fine, and I'm ready to post.

But by this time it's late, I'm tired, and I didn't notice that I somehow
managed to screw that up! In hindsight, I think I remember the sequence
of edits that caused it (I deleted something by accident and cut & pasted
it back), but that's an even more obscure saga.

I will now go and fix it and boot test again, just to be sure.

Grump.

2016-05-02 20:08:05

by Linus Torvalds

Subject: Re: [PATCH 2/2] <linux/hash.h>: Fix hash_64()'s horrible collision problem

On Mon, May 2, 2016 at 3:22 AM, George Spelvin <[email protected]> wrote:
> hash_64() was using a low-bit-weight multiplier, which resulted in
> very bad mixing of the high bits of the input. In particular,
> page-aligned pointers (low 12 bits not used) were a disaster.

So I did just a minimal fix for 4.6 (and back-porting), which took
just the constants and made _only_ the 64-bit architecture case use
this improved constant for hash_64.

In other words, people who use "hash_long()" or use "hash_64()" on
64-bit architectures will get the improvements, but if you use
hash_64() on a 32-bit architecture you'll continue to see the old
behavior.

Quite frankly, looking at some of the explicit hash_64() users, they
seem to be a bit dubious anyway. And it won't make things *worse* for
them.

So that simple "just use multiplication unconditionally on 64-bit, and
use the better constant" should fix the actual _practical_ problems
that we've seen. And it shouldn't have any negative consequences,
since as you say, 64-bit architectures universally do have a
multiplier.

The bigger changes will have to be for 4.7 by now, I think.

Linus


Attachments:
patch.diff (2.74 kB)

2016-05-02 20:26:39

by George Spelvin

Subject: Re: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

Linus wrote:
> I really think that we should *not* confuse those two odd svcauth.h
> users with the whole hash_32/64 thing.
>
> I think hash_str/hash_mem should be moved to lib/hash.c, and they
> just shouldn't use "hash_long()" at all, except at the very end (they
> currently have a very odd way of doing "every <n> bytes _and_ at the
> end".

Moving them is fine. I have no problem with that, except that I agree
that merging them with the fs/namei.c hashes would be even better.

But the hash_long is not odd at all. They're using it as the mix function
between adding words. Like hash_name() uses "hash = (hash + a) * 9".

So it's

x = (first 8 bytes of string)
x = __hash_64(x);
x ^= (next 8 bytes of string)
x = __hash_64(x);
... repeat ...
x ^= (last 1..8 bytes of string)
return __hash_64(x);

It's a quite reasonable mix function. One multiply (4 cycles or so) per
8 bytes. It's definitely swamped by the byte-at-a-time string loading.

(The one peculiar thing is that the "last 1..8 bytes of string" is, due
to the way it's shifted into an accumulator, actually the last 8 bytes,
which may include some previously-hashed bytes. But that has nothing
to do with the mixing.)


But... the fundamental reason I didn't was that this is late -rc.
I'm trying to fix a *bug* tglx found, not add risk and delay with a much
more ambitious patch series.

This is a related but separate issue that can be addressed separately.

> In particular, the hashing in the *middle* is very different from the
> hashing at the end.
>
> At the end, you need to make sure the lower bits get spread out
> particularly to the upper bits, since you're going to shift things
> down.
>
> But in the middle, you just want to spread the bits out (and in
> particular, destroy any byte-per-byte patterns that build up in
> between).

Yes, in the middle you have a full width hash to spread the bits among,
while at the end you have to "concentrate" the hash value in a few bits.
The latter is a more difficult task and might be slower.

But there's nothing really *wrong* with using the same mix operation
both places if it's both good enough and fast enough.

In particular, if you're loading the string a word at a time, you need
a much better mixing function than if you're processing it byte at a time.

> Quite frankly, I think those functions should just use something like
> the FNV hash (or Jenkins hash), and then maybe use "hash_long()" at
> the *end* to shift the result down to "bits".

It *is* the FNV hash! More specifically, FNV-1a done a word at a time;
doing it byte at a time like most implementations would result in 8
times as many multiplies and be slower.

The only difference is the exact multiplier. The published FNV-1
uses a low-bit-weight multiplier. Since this is being done a word
at a time, I think the stronger multiplier is worth it.

Jenkins hash is (if you have a HW multiplier) slower. Better mixing,
so adding a multiply at the end would be redundant.

> I don't want to make our <linux/hash.h> odder just because of two
> entirely broken users.

> That said, I actually think hash_str() should be killed entirely.
> Better just use what we use for pathnames: full_name_hash() (which
> gets a pointer and length) and hash_name (which gets the string).

I was thinking the exact same thing. Repeated grepping over the kernel
tree for "hash_*" functions showed me just how many there are in various
places, and combining some would be a good idea.

For example, partial_name_hash() is still used in many places
even if the word-at-a-time code is used in namei.c.

> Those functions do the "word-at-a-time" optimization, and they should
> do a good enough job. If they aren't, we should fix them, because they
> are a hell of a lot more important than anything that the svcauth code
> does.

Okay, let me list the problems...

1) hash_name() stops at a slash or a nul. hash_str() only looks
for a nul. Should I add a third variant? Should that go in fs/namei,
or should the whole pile be moved elsewhere?

2) Some places need the length *and* the hash. Calling strlen() and then
full_name_hash() somewhat defeats the purpose of word-at-a-time access.
hash_name returns both jammed into a 64-bit word. Is that a good
interface in general?

Myself, I think the length should be computed at copy_from_user()
time and I'd like to look at each such call site and understand why
it *doesn't* have the length ready. But that's a lot of work.

3) They do particularly crappy mixing. See that "RFC PATCH 4/2" I posted
because I couldn't stand how bad it was.

If you don't have word at a time, the partial_name_hash() is decent,
but the word-at-a-time mixes by multiplying by 9. So the hashes
of the strings "1.......0" and "0.......9" are identical.

(I assume this was chosen as the best available one-instruction (LEA)
mix operation due to an obsessive interest in speed in the dcache.)

More crappy mixing is the final folding. On a 64-bit machine, the
high and low 32 bits are just XORed together. So the hashes of
"deadbeef" and "beefdead" are identical.

I agree we should be very careful of the mixing function, since it's
the only thing limiting loop cycle time. The has_zero hackery has a
cricital path about 6 cycles long, but they're independent per loop
and a sufficiently aggressive OOO machine could pipeline them.

(If you have a particular cycle count budget in mind, I can come up with
something.)

4) They don't do a final mix. Again, obsessive interest in speed when
you're storing the whole 32 bits and comparing that. For applications
that use only a few bits of hash index and need the entropy
"concentrated", should this be done outside?

Basicaly, I can understand why someone might prefer a stronger hash
for less speed-critical applications.


I agree that we should ideally have just two string hash functions for
general kernel use (plus any imposed by external standards like file
systems or ELF).

One is the dcache hash, for applications where collision DoS attacks are
not expected and is optimized strongly for speed.

A second, something like SipHash, for applications which require
collision resistance against malicious attackers.

But achieving that ideal is a project of significant size.
There are a lot of corner cases to worry about.

2016-05-02 21:19:40

by Linus Torvalds

Subject: Re: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

On Mon, May 2, 2016 at 1:26 PM, George Spelvin <[email protected]> wrote:
>
> But the hash_long it not odd at all. They're using it as the mix function
> between adding words. Like hash_name() uses "hash = (hash + a) * 9".

Right. But there is no reason to think that that should be the same
thing as the final hash.

In fact, there are many reasons to think it should *not*.

The final hash is different.

It's different not just because it wants to concentrate the bits at
the high end (which the middle hash does not), but it's different
exactly because the whole calling convention is different: the final
hash returns a "small" value (in your version an "u32"), while the
hash in the middle very much does not.

So thinking that they are somehow related is wrong. It screws up
hash.h, and incorrectly conflates this middle hash_mix() thing with
the final hash_32/64() thing.

> It's a quite reasonable mix function. One multiply (4 cycles or so) per
> 8 bytes. It's definitely swamped by the byte-at-a-time string loading.

.. but it's not. Exactly because of the above.

Make it be a "hash_mix()" function. Make it use the multiply, by all
means. Same multiplier, even. BUT IT IS STILL NOT THE SAME FUNCTION,
for the above reason. One wants to return "u32", the other does not.

Really.

> But... the fundamental reason I didn't was that this is late -rc.
> I'm trying to fix a *bug* tglx found, not add risk and delay with a much
> more ambitious patch series.

Oh, this late in the rc we're not doing _any_ of this. I sent out my
suggested "late in the rc and for stable" patch that fixes the
practical problem we have, that has nothing to do with cleaning things
up.

> It *is* the FNV hash! More specifically, FNV-1a done a word at a time;
> doing it byte at a time like most implementations would result in 8
> times as many multiplies and be slower.

I refuse to call that shit "word at a time".

It's "byte at a time with a slow conditional that will screw up your
branch predictor and a multiply in the middle".

A compiler might be able to turn it into some disgusting unrolled
thing that avoids some of the problems, but at no point is that a good
thing.

I seriously think that it would be

(a) more readable

(b) have a better mix function

if it was just kept as a byte-at-a-time thing entirely with the
per-byte mixing thing done better than just "shift by 8 bits".

And then at the end you could do a single hash_long().

That horrible svc thing needs to go.

It's all pretty moot, since we have a reasonable version that
actually does do word-at-a-time.

That can be improved too, I'm sure, but that svcauth garbage should
just be thrown out.

> The only difference is the exact multiplier. The published FNV-1
> uses a low-bit-weight multiplier. Since this is being done a word
> at a time, I think the stronger multiplier is worth it.

.. Considering that the "word-at-a-time" is likely *slower* than doing
it a byte-at-a-time the way it has been encoded, I'm not in the least
convinced.

> For example, partial_name_hash() is still used in many places
> even if the word-at-a-time code is used in namei.c.

Those places aren't actually all that performance-critical. They
really don't matter.


> Okay, let me list the problems...
>
> 1) hash_name() stops at a slash or a nul. hash_str() only looks
> for a nul. Should I add a third variant? Should that go in fs/namei,
> or should the whole pile be moved elsewhere?

We'll never get rid of "hash_name()", it not only has that '/' case,
it's also inlined for a reason. You'd copy it without the magic for
'/' and turn that into str_hash() for others to use.

full_name_hash() can presumably be used pretty much as-is as mem_hash().

> 2) Some places need the length *and* the hash. Calling strlen() and then
> full_name_hash() somewhat defeats the purpose of word-at-a-time access.
> hash_name returns both jammed into a 64-bit word. Is that a good
> interface in general?

Hmm. We actually have that exact case in the dentry code and
hash_name(), except we handle it by doing that "hashlen" thing that
contains both the length and the hash in one 64-bit thing.

Maybe we could expose that kind of interface, even if it's pretty ugly.

> Myself, I think the length should be computed at copy_from_user()
> time and I'd like to look at each such call site and understand why
> it *doesn't* have the length ready. But that's a lot of work.

If it's copied from user space, we already do have the length.

You do realize that pathnames are different from pretty much every
other case in the kernel? There's a reason pathnames have their own
logic. The string length is very different from the component length,
and it turns out that component length and hash is generally critical.

> 3) They do particularly crappy mixing. See that "RFC PATCH 4/2" I posted
> because I couldn't stand how bad it was.
>
> If you don't have word at a time, the partial_name_hash() is decent,
> but the word-at-a-time mixes by multiplying by 9. So the hashes
> of the strings "1.......0" and "0.......9" are identical.
>
> (I assume this was chosen as the best available one-instruction (LEA)
> mix operation due to an obsessive interest in speed in the dcache.)

The thing is, a lot of people tend to optimize performance (and
behavior) for large strings.

For path components, the most common lengths are less than a single
8-byte word! That "mixing" function almost doesn't matter, because the
case that matters the most (by far) are strings that fit in one or
_maybe_ two words.

Yes, things are slowly moving towards longer pathnames, but even with
long filenames, most component names are the directory entries that
still tend to be shorter. We still have the old 3-letter ones close to
the root, of course, but the statistics are still pretty heavily
skewed to <15 characters especially if you look at directory names.

So for pathnames, the mixing in the middle has tended to not be
_nearly_ as important as the final mix, for the simple reason that it
happens maybe once, often not at all.

> More crappy mixing is the final folding. On a 64-bit machine, the
> high and low 32 bits are just XORed together. So the hashes of
> "deadbeef" and "beefdead" are identical.

Hmm? It uses "fold_hash()", which definitely doesn't do that.

Are you still looking at partial_name_hash() and friends? Don't. They
are garbage. They are used for things like case-insensitive
filesystems etc.

> (If you have a particular cycle count budget in mind, I can come up with
> something.)

The real issue I had in fs/namei.c is that link_path_walk() and
__d_lookup_rcu() are literally _the_ hottest functions in the kernel
under a few (common) loads, and for code generation things like
register pressure ends up mattering.

The reason that "hash_len" is a single 64-bit field rather than two
32-bit fields, for example, is that that way it takes one _register_
when we do the hash lookup. Some of that code was tuned to inline - and
_not_ inline in particular patterns.

Now, some of that tuning may have bitrotted, of course, so I'm not
saying it's necessarily doing great now. But some of that code was
tuned to not need a stack frame etc.

> 4) They don't do a final mix. Again, obsessive interest in speed when
> you're storing the whole 32 bits and comparing that. For applications
> that use only a few bits of hash index and need the entropy
> "concentrated", should this be done outside?

I think the caller should do it, yes. There's a difference between
"calculate a hash for a string" and "turn that hash into a hashtable
lookup".

Linus

2016-05-02 21:41:11

by Linus Torvalds

Subject: Re: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

On Mon, May 2, 2016 at 2:19 PM, Linus Torvalds
<[email protected]> wrote:
>
> The reason that "hash_len" is a single 64-bit field rather than two
> 32-bit fields, for example, is that that way it takes one _register_
> when we do the hash lookup. Some of that code was tuned to inline - and
> _not_ inline in particular patterns.

Actually, I think the tuning for no stack frame etc was mostly for the
permission testing with the selinux AVC code.

The filename hashing does have some of it too - like making sure that
the call to ->d_hash() is in an unlikely path etc and doesn't pollute
the case that actually matters. But I don't think any of it is simple
enough to avoid a stack frame.

The hash_len thing did make the innermost hash lookup loop smaller,
which was noticeable at some point.

What I really wanted to do there was actually have a direct-mapped "L1
dentry hash cache", that didn't have a hash loop at all (it would fall
back to the slow case for that). I had a patch for that (which worked
*beautifully*, partly because it also moved the hot entries to that
hash cache and thus effectively moved the active entries to the head
of the queue), but I couldn't get the L1 cache update to be coherent
without locking, which killed the thing.

Anyway, I suspect that your mixing function changes should be fine.
That link_path_walk() is important, but a couple of shifts and xors
shouldn't kill it.

Linus

2016-05-03 01:59:04

by George Spelvin

Subject: Re: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

> Right. But there is no reason to think that that should be the same
> thing as the final hash.

Your logic is absolutely correct. I agree with you. The two operations
have different purposes, and should be thought of differently.

HOWEVER, if you can find one operation which is good enough to
serve both purposes, that's not evidence of incompetence.

I'm not saying hash_str is anything wonderful. Just that it's not a
complete disaster. (As a hash function. Your comments about its
performance are addressed below.)

It's frankly a lot *better* than the hash computed by namei.c.
That one *is* perilously close to a complete disaster.


The main problem with the multiplicative hash (which describes both
hash_str() and hash_name()) is that input bits can only affect higher
state bits. The lsbit of the 64-bit state is just the XOR of all the
input lsbits. The lsbyte depends only on the lsbytes.

But that's not a disaster. In hash_str(), we throw away the lsbits
at the end so their imperfect mixing doesn't matter.

What's bad is that the msbits of the inputs can only affect the
msbits of the hash. The msbits of the inputs are just XORed together
into the msbit of the hash. It's fundamentally impossible for
such a hash to detect any 2-bit change to the msbits of the input.
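
A tiny userspace demonstration of that last point (toy code, not a
proposed kernel change; the mixing step just mirrors the multiply-and-XOR
structure of hash_str(), and the GOLDEN_RATIO_64 value is the one from
patch 2/2):

	#include <stdio.h>
	#include <stdint.h>

	#define GOLDEN_RATIO_64 0x61C8864680B583EBull

	/* "XOR in a word, multiply" -- the hash_str()-style mix. */
	static uint64_t mix_words(const uint64_t *w, int n)
	{
		uint64_t hash = 0;
		int i;

		for (i = 0; i < n; i++)
			hash = (hash ^ w[i]) * GOLDEN_RATIO_64;
		return hash;
	}

	int main(void)
	{
		uint64_t a[2] = { 0x1234, 0x5678 };
		uint64_t b[2] = { a[0] ^ (1ULL << 63), a[1] ^ (1ULL << 63) };

		/* Prints the same value twice: the two msbit flips cancel. */
		printf("%016llx\n%016llx\n",
		       (unsigned long long)mix_words(a, 2),
		       (unsigned long long)mix_words(b, 2));
		return 0;
	}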

That said, you're right that most names aren't long enough (16 bytes on
little-endian) to *have* a second msbit to worry about.

> I refuse to call that shit "word at a time".
>
> It's "byte at a time with a slow conditional that will screw up your
> branch predictor and a multiply in the middle".

The *hashing* is word at a time. The *fetching* is done byte at a time
and I agree that it doesn't qualify in any way at all.

But yeah, your point about the branch predictor penalty being worse than
the 7 saved multiplies is well taken.

Rule #1 of performance: Avoid lock contention at any cost.
Rule #2 of performance: Avoid cache misses unless it conflicts with #1.
(Rule 2a: Cache bouncing is a particularly pernicious form of cache missing.)
(Rule 2b: Locking, even uncontended, tends to cause cache bouncing.)
Rule #3 of performance: Avoid unpredictable branches.

>> For example, partial_name_hash() is still used in many places
>> even if the word-at-a-time code is used in namei.c.

> Those places aren't actually all that performance-critical. They
> really don't matter.

AFAICT, none of the string-hash functions outside of fs/ are
on particularly hot paths. The goal is deleting redundant code.

> We'll never get rid of "hash_name()", it not only has that '/' case,
> it's also inlined for a reason. You'd copy it without the magic for
> '/' and turn that into str_hash() for others to use.
>
> full_name_hash() can presumably be used pretty much as-is as mem_hash().

That part is obvious. I was just caught between two unpleasant
possibilities:

- Placing a str_hash() helper function in the middle of fs/namei.c which
nothing in fs/namei.c actually calls, or
- Copying it to some outside file and then having to keep the
two in sync.

Thinking about the function comments on each, the first seems less
embarrassing to write.

> Maybe we could expose that kind of interface, even if it's pretty ugly.

Yeah, that's pretty much what I thought. I just hoped you had some
brilliant idea for avoiding it.

> The thing is, a lot of people tend to optimize performance (and
> behavior) for large strings.
>
> For path components, the most common lengths are less than a single
> 8-byte word! That "mixing" function almost doesn't matter, because the
> case that matters the most (by far) are strings that fit in one or
> _maybe_ two words.

I'll remember that next time I look up
.git/objects/69/466b786e99a0a2d86f0cb99e0f4bb61588d13c

:-)

But yes, it makes your point about pathname components.
The first three are all a single word.

I just worry about people having directories full of PHP state
cookies.

> Hmm? It uses "fold_hash()", which definitely doesn't do that.

Mea culpa; that's a side effect of repeatedly grepping the kernel.
There was a *different* hash folding function in some other source file
elsewhere that did that, and I got the two mixed up in my memory.

The word-at-a-time one does "hash_64(x)", which doesn't have that problem.

(Maybe it was the IP checksum folding?)

>> (If you have a particular cycle count budget in mind, I can come up with
>> something.)

> The real issue I had in fs/namei.c is that link_path_walk() and
> __d_lookup_rcu() are literally _the_ hottest functions in the kernel
> under a few (common) loads, and for code generation things like
> register pressure ends up mattering.
>
> The reason that "hash_len" is a single 64-bit field rather than two
> 32-bit fields, for example, is that that way it takes one _register_
> when we do the hash lookup. Some of that code was tuned to inline - and
> _not_ inline in particular patterns.
>
> Now, some of that tuning may have bitrotted, of course, so I'm not
> saying it's necessarily doing great now. But some of that code was
> tuned to not need a stack frame etc.

Yes, and it's hard to make a general purpose helper out of code
that's tuned to piano wire tension like that.

I was thinking about very fast hash functions (2 or 3 cycles
per word), and the obvious solution is to use a second register.

As you pointed out above, the mixing function can take advantage of an
internal state which is larger than the final state, so the easy way to
minimize collisions without very expensive state mixing between words
is to give the bits more room to spread out.

I was playing with the idea of an ARX structure like the "Speck" block
cipher (https://en.wikipedia.org/wiki/Speck_%28cipher%29):

h1 ^= a;
h2 ^= h1; h1 = ROTL(h1, K);
h1 += h2; h2 *= 9;

The "h2 *= 9" replaces "ROTL(h2, 3)" in Speck, achieves a little more
mixing, is one cycle on most machines, and is probably supported by
more functional units than a general barrel shift.

It's only one more cycle on the critical path than the current
"hash = (hash + a) * 9"... but it's one more register.


> I think the caller should do it, yes. There's a difference between
> "calculate a hash for a string" and "turn that hash into a hashtable
> lookup".

Makes sense, thanks.

Another thing worth doing is having a slightly less thorough folding
function than hash_64(x, 32). (x >> 32) + (u32)x * CONSTANT does
just as well if you're keeping all 32 bits.

This is basically the first half of my 32-bit hash_64(). Then if you
want less than 32 bits, you can do a second hash_32() on the result.
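
In code, that fold might look like the sketch below (the multiplier is
left open above; GOLDEN_RATIO_32 from patch 2/2 is just one plausible
choice, and the function name is made up):

	/* Fold a 64-bit hash down to 32 bits without a 64-bit multiply. */
	static inline u32 fold_hash_64(u64 x)
	{
		return (u32)(x >> 32) + (u32)x * GOLDEN_RATIO_32;
	}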

> Anyway, I suspect that your mixing function changes should be fine.
> That link_path_walk() is important, but a couple of shifts and xors
> shouldn't kill it.

Actually, that code
1) Uses an extra register for a shift temporary anyway, and
2) Has a 6-cycle critical path, which is pretty borderline.

The ARX code above seems like a more efficient use of two registers.

(It's also conveniently nonlinear, so if we want, feeding a salt into
h2 makes it less utterly trivial to force collisions.)

I'll play with that for a bit. Thank you for mentioning those critical
functions; I'll check the x86 code generation for them to make sure it
doesn't get worse.



P.S. Here's a way to improve partial_name_hash on x86.
Compare the assembly for

unsigned long
pnh1(unsigned long c, unsigned long prevhash)
{
	return (prevhash + (c << 4) + (c >> 4)) * 11;
}

pnh1:
	movl %eax, %ecx
	shrl $4, %eax
	sall $4, %ecx
	addl %ecx, %eax
	addl %eax, %edx
	leal (%edx,%edx,4), %eax
	leal (%edx,%eax,2), %eax
	ret

unsigned long
pnh2(unsigned long c, unsigned long prevhash)
{
	prevhash += c <<= 4;
	prevhash += c >> 8;
	return prevhash * 11;
}

pnh2:
	sall $4, %eax
	addl %eax, %edx
	shrl $8, %eax
	addl %eax, %edx
	leal (%edx,%edx,4), %eax
	leal (%edx,%eax,2), %eax
	ret

pnh1 doesn't know that "c" is really only 8 bits and so it doesn't need
to copy it to another register to compute the two shifted forms.

2016-05-03 03:01:45

by Linus Torvalds

Subject: Re: [PATCH 1/2] <linux/hash.h>: Make hash_64(), hash_ptr() return 32 bits

On Mon, May 2, 2016 at 6:59 PM, George Spelvin <[email protected]> wrote:
>
> AFAICT, none of the string-hash functions outside of fs/ are
> on particularly hot paths. The goal is deleting redundant code.

Yes, agreed.

>> We'll never get rid of "hash_name()", it not only has that '/' case,
>> it's also inlined for a reason. You'd copy it without the magic for
>> '/' and turn that into str_hash() for others to use.
>>
>> full_name_hash() can presumably be used pretty much as-is as mem_hash().
>
> That part is obvious. I was just caught between two unpleasant
> possibilities:
>
> - Placing a str_hash() helper function in the middle of fs/namei.c which
> nothing in fs/namei.c actually calls, or
> - Copying it to some outside file and then having to keep the
> two in sync.

So I don't think the "keep the two in sync" is necessarily all that problematic.

The word-at-a-time logic _used_ to be very specific to the name_hash()
code, but it got made generic enough over a few iterations that it's
actually a fairly reasonable pattern now.

So the code loop ends up being really not very complex:

	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	hash = a = 0;
	len = -sizeof(unsigned long);
	do {
		hash = (hash + a) * 9;
		len += sizeof(unsigned long);
		a = load_unaligned_zeropad(name+len);
		b = a ^ REPEAT_BYTE('/');
	} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

and with your suggested "hash_mix()" function (may I suggest we just
make it take both the old hash and the new value as two separate
arguments, and it can choose how to mix them), there remains pretty
much nothing to keep in sync.
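
Purely as an interface sketch (the body here is just today's
"(hash + a) * 9" step, not a proposal for the mixing math itself):

	static inline unsigned long hash_mix(unsigned long hash, unsigned long a)
	{
		return (hash + a) * 9;
	}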

Yes, there's the tail part, but that ends up being pretty simple too.

The non-path-component case (so checking only for an ending NUL
character, not the '/') ends up being exactly the same, except all the
'b' logic goes away, so you end up with

	hash = a = 0;
	len = -sizeof(unsigned long);
	do {
		hash = hash_mix(hash, a);
		len += sizeof(unsigned long);
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

which really seems to not be a pain. Very simple, in fact. There's
hardly any code left to keep in sync: any tweaking of the hash would
happen by tweaking the hash_mix().

The tail part would presumably be:

	adata = prep_zero_mask(a, adata, &constants);

	mask = create_zero_mask(adata | bdata);
	return hash_mix(hash, a & zero_bytemask(mask));

and you're done (or we could perhaps decide that the last mix is fine
just doing a single add? We will have mixed up all previous hash
values, so that last "hash_mix()" might just be a simple addition).

Yes, if we want to return some mix of the length and the hash, we'd
have to play those hashlen games, but I suspect the only case that
*really* cares deeply about that is the dentry hash case, and we'd
just keep it separate.

In other words, I think just the addition of your "hash_mix()" helper
is enough to abstract things out enough that there really is nothing
left but tying all those pieces together, and no maintenance burden
from having "two copies" of that tying-togetherness.

>> For path components, the most common lengths are less than a single
>> 8-byte word! That "mixing" function almost doesn't matter, because the
>> case that matters the most (by far) are strings that fit in one or
>> _maybe_ two words.
>
> I'll remember that next time I look up
> .git/objects/69/466b786e99a0a2d86f0cb99e0f4bb61588d13c
>
> :-)

Yes, they happen, but when people have pathnames like that, your
hashing function generally isn't going to much matter.

Except you absolutely want to avoid 8-bit and 4-bit boundaries when
mixing. The "*9" we have now does that, we had a *11 in an earlier
incarnation (but that was coupled with shifting right too - I think
our old hash remains in the partial_name_hash())

I do agree that it's not a great hash mixing function, but I don't
think it's been particularly problematic either. I did run my whole
filesystem through the hash at some point just to verify, and the
statistics seemed fairly balanced.

But yes, I think your hash_mix() function is likely a noticeable
improvement from a hashing standpoint. And while it may not be all
that noticeable for path components that are usually pretty short, if
we extend this code to be a generic string hash then the whole "three
characters is the most common component length" argument goes away ;)

> I was playing with the idea of an ARX structure like the "Speck" block
> cipher (https://en.wikipedia.org/wiki/Speck_%28cipher%29):
>
> h1 ^= a;
> h2 ^= h1; h1 = ROTL(h1, K);
> h1 += h2; h2 *= 9;
>
> The "h2 *= 9" replaces "ROTL(h2, 3)" in Speck, achieves a little more
> mixing, is one cycle on most machines, and is probably supported by
> more functional units than a general barrel shift.
>
> It's only one more cycle on the critical path than the current
> "hash = (hash + a) * 9"... but it's one more register.

Try it.

I wouldn't worry too much about 32-bit x86 any more (we want it to not
suck horribly, but it's not the primary target for anybody who cares
about best performance). But x86-64 code generation is worth
looking at. The register pressure issue is still real, but it's not
quite as bad as the old 32-bit code.

The other main architectures that it would be good to verify are ok
are ARMv8 and powerpc.

> P.S. Here's a way to improve partial_name_hash on x86.
> Compare the assembly for
>
> unsigned long
> pnh1(unsigned long c, unsigned long prevhash)
> {
> return (prevhash + (c << 4) + (c >> 4)) * 11;
> }
>
> pnh1:
> movl %eax, %ecx
> shrl $4, %eax
> sall $4, %ecx
> addl %ecx, %eax
> addl %eax, %edx
> leal (%edx,%edx,4), %eax
> leal (%edx,%eax,2), %eax
> ret
>
> unsigned long
> pnh2(unsigned long c, unsigned long prevhash)
> {
> prevhash += c <<= 4;
> prevhash += c >> 8;
> return prevhash * 11;
> }
>
> pnh2:
> sall $4, %eax
> addl %eax, %edx
> shrl $8, %eax
> addl %eax, %edx
> leal (%edx,%edx,4), %eax
> leal (%edx,%eax,2), %eax
> ret
>
> pnh1 doesn't know that "c" is really only 8 bits and so it doesn't need
> to copy it to another register to compute the two shifted forms.

Well, if I cared about the partial_name_hash() (which I don't), I'd
suggest you just convince the compiler to generate

rolb $4,%al

instead of two shifts, and just be done with it.

You might find that you end up with partial register write stalls,
though, so you might have to add a "movzbq %al,%rax" to get around
those.

In fact, it looks like you can get gcc to generate that code:

unsigned long pnh3(unsigned char c, unsigned long prevhash)
{
	c = (c << 4) | (c >> 4);
	return (prevhash + c)*11;
}

generates

pnh3:
	rolb $4, %dil
	movzbl %dil, %edi
	addq %rdi, %rsi
	leaq (%rsi,%rsi,4), %rax
	leaq (%rsi,%rax,2), %rax
	ret

which is one instruction less than your pnh2, but should perform even
better because I think "movzbl" ends up being done as a rename and
mask in the microarchitecture - without any actual ALU costs or added
latency.

But I suspect it can be a bit fragile to get gcc to actually generate
that rotate instruction. I could easily see that being inlined, and
then the pattern that turns into a rotate instruction gets perturbed
enough that gcc no longer generates the rotate.

Linus