LinuxLists.cc - [PATCH] lib/sha1: remove memsets and allocate workspace on the stack

2011-08-08 23:08:08

Subject: [PATCH] lib/sha1: remove memsets and allocate workspace on the stack

The previous implementation required the workspace to be passed in as
a parameter. This prevents the compiler from being able to store the
workspace in registers. I've also removed the memset since that also
prevents the compiler from storing the workspace in registers.

There is no loss of security due to removing the memset. It would be a
bug for the stack to leak to userspace. However, a defence-in-depth
argument could be made for keeping the clearing of the workspace.

For ChromiumOS, we use SHA-1 to verify the integrity of the root
filesystem. The speed of the kernel sha-1 implementation has a major
impact on our boot performance.

This patch results in a 190 ms improvement in boot speed.

10 reboots, remove slowest/fastest.

Before:

Mean: 5.89 seconds
Stdev: 0.07

After:

Mean: 5.70 seconds
Stdev: 0.04

Signed-off-by: Mandeep Singh Baines <[email protected]>
Suggested-by: Linus Torvalds <[email protected]>
Link: http://thread.gmane.org/gmane.linux.kernel.cryptoapi/5758/focus=5769
Cc: Ramsay Jones <[email protected]>
Cc: Nicolas Pitre <[email protected]>
Cc: Joachim Eastwood <[email protected]>
Cc: Andreas Schwab <[email protected]>
Cc: Herbert Xu <[email protected]>
Cc: David S. Miller <[email protected]>
Cc: [email protected]
Cc: Linus Torvalds <[email protected]>
---
crypto/sha1_generic.c | 4 ----
drivers/char/random.c | 7 +++----
include/linux/cryptohash.h | 3 +--
lib/sha1.c | 8 ++------
net/ipv4/syncookies.c | 5 ++---
net/ipv4/tcp_output.c | 4 +---
net/ipv6/syncookies.c | 5 ++---
7 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/crypto/sha1_generic.c b/crypto/sha1_generic.c
index 00ae60e..e6dd074 100644
--- a/crypto/sha1_generic.c
+++ b/crypto/sha1_generic.c
@@ -49,8 +49,6 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
src = data;

if ((partial + len) >= SHA1_BLOCK_SIZE) {
- u32 temp[SHA_WORKSPACE_WORDS];
-
if (partial) {
done = -partial;
memcpy(sctx->buffer + partial, data,
@@ -63,8 +61,6 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
done += SHA1_BLOCK_SIZE;
src = data + done;
} while (done + SHA1_BLOCK_SIZE <= len);
-
- memset(temp, 0, sizeof(temp));
partial = 0;
}
memcpy(sctx->buffer + partial, src, len - done);
diff --git a/drivers/char/random.c b/drivers/char/random.c
index c35a785..88a30f4 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -816,13 +816,13 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min,
static void extract_buf(struct entropy_store *r, __u8 *out)
{
int i;
- __u32 hash[5], workspace[SHA_WORKSPACE_WORDS];
+ __u32 hash[5];
__u8 extract[64];

/* Generate a hash across the pool, 16 words (512 bits) at a time */
sha_init(hash);
for (i = 0; i < r->poolinfo->poolwords; i += 16)
- sha_transform(hash, (__u8 *)(r->pool + i), workspace);
+ sha_transform(hash, (__u8 *)(r->pool + i));

/*
* We mix the hash back into the pool to prevent backtracking
@@ -839,9 +839,8 @@ static void extract_buf(struct entropy_store *r, __u8 *out)
* To avoid duplicates, we atomically extract a portion of the
* pool while mixing, and hash one final time.
*/
- sha_transform(hash, extract, workspace);
+ sha_transform(hash, extract);
memset(extract, 0, sizeof(extract));
- memset(workspace, 0, sizeof(workspace));

/*
* In case the hash function has some recognizable output
diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index 2cd9f1c..2c17ded 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -3,10 +3,9 @@

#define SHA_DIGEST_WORDS 5
#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
-#define SHA_WORKSPACE_WORDS 16

void sha_init(__u32 *buf);
-void sha_transform(__u32 *digest, const char *data, __u32 *W);
+void sha_transform(__u32 *digest, const char *data);

#define MD5_DIGEST_WORDS 4
#define MD5_MESSAGE_BYTES 64
diff --git a/lib/sha1.c b/lib/sha1.c
index f33271d..166d9f2 100644
--- a/lib/sha1.c
+++ b/lib/sha1.c
@@ -66,20 +66,16 @@
*
* @digest: 160 bit digest to update
* @data: 512 bits of data to hash
- * @array: 16 words of workspace (see note)
*
* This function generates a SHA1 digest for a single 512-bit block.
* Be warned, it does not handle padding and message digest, do not
* confuse it with the full FIPS 180-1 digest algorithm for variable
* length messages.
- *
- * Note: If the hash is security sensitive, the caller should be sure
- * to clear the workspace. This is left to the caller to avoid
- * unnecessary clears between chained hashing operations.
*/
-void sha_transform(__u32 *digest, const char *data, __u32 *array)
+void sha_transform(__u32 *digest, const char *data)
{
__u32 A, B, C, D, E;
+ __u32 array[16];

A = digest[0];
B = digest[1];
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 92bb943..6cdead4 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -37,8 +37,7 @@ __initcall(init_syncookies);
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)

-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
- ipv4_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5], ipv4_cookie_scratch);

static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
@@ -50,7 +49,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
tmp[1] = (__force u32)daddr;
tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
tmp[3] = count;
- sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+ sha_transform(tmp + 16, (__u8 *)tmp);

return tmp[17];
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 882e0b0..db1932c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2494,7 +2494,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
}

if (opts.hash_size > 0) {
- __u32 workspace[SHA_WORKSPACE_WORDS];
u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];

@@ -2511,8 +2510,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */

sha_transform((__u32 *)&xvp->cookie_bakery[0],
- (char *)mess,
- &workspace[0]);
+ (char *)mess);
opts.hash_location =
(__u8 *)&xvp->cookie_bakery[0];
}
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 89d5bf8..c22ab16 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -63,8 +63,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
return child;
}

-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
- ipv6_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5], ipv6_cookie_scratch);

static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
__be16 sport, __be16 dport, u32 count, int c)
@@ -81,7 +80,7 @@ static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *dadd
memcpy(tmp + 4, daddr, 16);
tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
tmp[9] = count;
- sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+ sha_transform(tmp + 16, (__u8 *)tmp);

return tmp[17];
}
--
1.7.3.1

2011-08-08 23:45:19

by Joe Perches

[permalink] [raw]

Subject: Re: [PATCH] lib/sha1: remove memsets and allocate workspace on the stack

On Mon, 2011-08-08 at 16:07 -0700, Mandeep Singh Baines wrote:
> The previous implementation required the workspace to be passed in as
> a parameter. This prevents the compiler from being able to store the
> workspace in registers. I've also removed the memset since that also
> prevents the compiler from storing the workspace in registers.

I did a similar patch locally.

> There is no loss of security due to removing the memset. It would be a
> bug for the stack to leak to userspace. However, a defence-in-depth
> argument could be made for keeping the clearing of the workspace.

You should add #include <linux/crypthash.h> to
lib/sha1.c and perhaps rationalize the use of __u8
and char for the second argument to sha_transform in
the definition and uses.

For defense in depth, a bool could be added to sha_transform
like:

void sha_transform(__u32 *digest, const char *data, bool wipe);

and the internal workspace memset after use if wipe set though
perhaps the memset could be a compile time option like
CONFIG_CRYPTO_DEFENSE_IN_DEPTH or such instead.

> diff --git a/drivers/char/random.c b/drivers/char/random.c
[]
> @@ -816,13 +816,13 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min,
> static void extract_buf(struct entropy_store *r, __u8 *out)
[]
> - sha_transform(hash, (__u8 *)(r->pool + i), workspace);
> + sha_transform(hash, (__u8 *)(r->pool + i));
[]
> diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
[]
> @@ -3,10 +3,9 @@
[]
> -void sha_transform(__u32 *digest, const char *data, __u32 *W);
> +void sha_transform(__u32 *digest, const char *data);
[]
> diff --git a/lib/sha1.c b/lib/sha1.c
[]
> +void sha_transform(__u32 *digest, const char *data)
[]
> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
[]
> @@ -50,7 +49,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
[]
> - sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
> + sha_transform(tmp + 16, (__u8 *)tmp);
[]
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
[]
> @@ -2511,8 +2510,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
[]
> sha_transform((__u32 *)&xvp->cookie_bakery[0],
> - (char *)mess,
> - &workspace[0]);
> + (char *)mess);
[]
> diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
[]
> @@ -81,7 +80,7 @@ static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *dadd
[]
> - sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
> + sha_transform(tmp + 16, (__u8 *)tmp);

2011-08-08 23:51:19

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH] lib/sha1: remove memsets and allocate workspace on the stack

On Mon, Aug 8, 2011 at 4:07 PM, Mandeep Singh Baines <[email protected]> wrote:
>
> There is no loss of security due to removing the memset. It would be a
> bug for the stack to leak to userspace. However, a defence-in-depth
> argument could be made for keeping the clearing of the workspace.

So I'm nervous about this just because I can see the security crazies
rising up about this.

The fact is, in our current code in drivers/char/random.c, we do have
a memset() of the workspace buffer on the stack, and any competent
compiler should actually just remove it, because it's dead memory (and
the compiler can *see* that it's dead memory). Of course, I don't know
if gcc does notice that, but it's a prime example of code that "looks"
secure, but has absolutely zero actual real security. Getting rid of
the memset() is actually better for *real* security, in that at least
it's not some kind of pointless security theater. But I can see some
people wanting to add a memory barrier or something to force the
memset() to actually take place.

So I dunno.

Arguably it's theoretically possible to find random data on the stack,
and maybe it can even be interesting (although I don't think the last
64 bytes of SHA1 state is all that exciting myself). Personally, I
consider it unlikely as hell to be relevant to anybody, and anybody
who has access to the kernel stack has *much* more direct security
holes than some random data that they can use. But the patch still
makes me worry about the brouhaha from some people.

Linus

2011-08-09 05:52:27

by Mandeep Singh Baines

[permalink] [raw]

Subject: Re: [PATCH] lib/sha1: remove memsets and allocate workspace on the stack

Joe Perches ([email protected]) wrote:
> On Mon, 2011-08-08 at 16:07 -0700, Mandeep Singh Baines wrote:
> > The previous implementation required the workspace to be passed in as
> > a parameter. This prevents the compiler from being able to store the
> > workspace in registers. I've also removed the memset since that also
> > prevents the compiler from storing the workspace in registers.
>
> I did a similar patch locally.
>
> > There is no loss of security due to removing the memset. It would be a
> > bug for the stack to leak to userspace. However, a defence-in-depth
> > argument could be made for keeping the clearing of the workspace.
>
> You should add #include <linux/crypthash.h> to
> lib/sha1.c and perhaps rationalize the use of __u8
> and char for the second argument to sha_transform in
> the definition and uses.
>
> For defense in depth, a bool could be added to sha_transform
> like:
>
> void sha_transform(__u32 *digest, const char *data, bool wipe);
>

This seems reasonable to me. In our case, there is no secrecy
issue. We hash the blocks of the RO root filesystem. There is nothing
secret about the rootfs blocks or their hash. We use the hashes to verify
that the blocks haven't been modified by an attacker.

We don't call sha_tranform directly. We use crypto_hash_digest. So maybe
add a wipe param there. I'm happy to work on or test such a patch if folks
think its interesting. Its saves me 190 ms on a 6 second boot. I suspect
there may be other hash intense applications that also don't need secracy.

> and the internal workspace memset after use if wipe set though
> perhaps the memset could be a compile time option like
> CONFIG_CRYPTO_DEFENSE_IN_DEPTH or such instead.
>

Maybe instead use wipe=0 in code paths where you don't need secracy.

> > diff --git a/drivers/char/random.c b/drivers/char/random.c
> []
> > @@ -816,13 +816,13 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min,
> > static void extract_buf(struct entropy_store *r, __u8 *out)
> []
> > - sha_transform(hash, (__u8 *)(r->pool + i), workspace);
> > + sha_transform(hash, (__u8 *)(r->pool + i));
> []
> > diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
> []
> > @@ -3,10 +3,9 @@
> []
> > -void sha_transform(__u32 *digest, const char *data, __u32 *W);
> > +void sha_transform(__u32 *digest, const char *data);
> []
> > diff --git a/lib/sha1.c b/lib/sha1.c
> []
> > +void sha_transform(__u32 *digest, const char *data)
> []
> > diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
> []
> > @@ -50,7 +49,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
> []
> > - sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
> > + sha_transform(tmp + 16, (__u8 *)tmp);
> []
> > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> []
> > @@ -2511,8 +2510,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
> []
> > sha_transform((__u32 *)&xvp->cookie_bakery[0],
> > - (char *)mess,
> > - &workspace[0]);
> > + (char *)mess);
> []
> > diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
> []
> > @@ -81,7 +80,7 @@ static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *dadd
> []
> > - sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
> > + sha_transform(tmp + 16, (__u8 *)tmp);
>
>

2011-08-09 07:01:25

by Geert Uytterhoeven

[permalink] [raw]

Subject: Re: [PATCH] lib/sha1: remove memsets and allocate workspace on the stack

On Tue, Aug 9, 2011 at 01:45, Joe Perches <[email protected]> wrote:
> You should add #include <linux/crypthash.h> to
> lib/sha1.c and perhaps rationalize the use of __u8
> and char for the second argument to sha_transform in
> the definition and uses.
>
> For defense in depth, a bool could be added to sha_transform
> like:
>
> void sha_transform(__u32 *digest, const char *data, bool wipe);

Yeah, why is the second parameter not "const void *data"? That
would save some casts in the callers.

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

2011-08-09 08:58:26

by Joe Perches

[permalink] [raw]

Subject: [PATCH] treewide: Update sha_transform

Move the workspace into sha_transform as local stack variable struct.

Remove #define SHA_WORKSPACE_WORDS.
Remove workspace argument from sha_transform.
Convert uses of __u8 * to void * in sha_transform.
Eliminate possible sha_transform unaligned accesses to data by copying
data to an aligned __u32 array if necessary.
Add sha_transform wipe argument to force workspace clearing if desired.
A little macro neatening.

This should speed network syncookies a trivial bit.

Add #include <linux/cryptohash.h> to lib/sha1.c

Compiled/untested.

Signed-off-by: Joe Perches <[email protected]>
---

On Mon, 2011-08-08 at 22:52 -0700, Mandeep Singh Baines wrote:
> We don't call sha_tranform directly. We use crypto_hash_digest. So maybe
> add a wipe param there. I'm happy to work on or test such a patch if folks
> think its interesting. Its saves me 190 ms on a 6 second boot. I suspect
> there may be other hash intense applications that also don't need secracy.

Well, here's the patch I produced.

crypto/sha1_generic.c | 5 +---
drivers/char/random.c | 7 ++---
include/linux/cryptohash.h | 3 +-
lib/sha1.c | 61 +++++++++++++++++++++++++++++++-------------
net/ipv4/syncookies.c | 5 +--
net/ipv4/tcp_output.c | 6 +---
net/ipv6/syncookies.c | 5 +--
7 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/crypto/sha1_generic.c b/crypto/sha1_generic.c
index 00ae60e..d0c3f4a 100644
--- a/crypto/sha1_generic.c
+++ b/crypto/sha1_generic.c
@@ -49,8 +49,6 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
src = data;

if ((partial + len) >= SHA1_BLOCK_SIZE) {
- u32 temp[SHA_WORKSPACE_WORDS];
-
if (partial) {
done = -partial;
memcpy(sctx->buffer + partial, data,
@@ -59,12 +57,11 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
}

do {
- sha_transform(sctx->state, src, temp);
+ sha_transform(sctx->state, src, true);
done += SHA1_BLOCK_SIZE;
src = data + done;
} while (done + SHA1_BLOCK_SIZE <= len);

- memset(temp, 0, sizeof(temp));
partial = 0;
}
memcpy(sctx->buffer + partial, src, len - done);
diff --git a/drivers/char/random.c b/drivers/char/random.c
index c35a785..6b9e5dc 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -816,13 +816,13 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min,
static void extract_buf(struct entropy_store *r, __u8 *out)
{
int i;
- __u32 hash[5], workspace[SHA_WORKSPACE_WORDS];
+ __u32 hash[5];
__u8 extract[64];

/* Generate a hash across the pool, 16 words (512 bits) at a time */
sha_init(hash);
for (i = 0; i < r->poolinfo->poolwords; i += 16)
- sha_transform(hash, (__u8 *)(r->pool + i), workspace);
+ sha_transform(hash, r->pool + i, false);

/*
* We mix the hash back into the pool to prevent backtracking
@@ -839,9 +839,8 @@ static void extract_buf(struct entropy_store *r, __u8 *out)
* To avoid duplicates, we atomically extract a portion of the
* pool while mixing, and hash one final time.
*/
- sha_transform(hash, extract, workspace);
+ sha_transform(hash, extract, true);
memset(extract, 0, sizeof(extract));
- memset(workspace, 0, sizeof(workspace));

/*
* In case the hash function has some recognizable output
diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index 2cd9f1c..c64b5cf 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -3,10 +3,9 @@

#define SHA_DIGEST_WORDS 5
#define SHA_MESSAGE_BYTES (512 /*bits*/ / 8)
-#define SHA_WORKSPACE_WORDS 16

void sha_init(__u32 *buf);
-void sha_transform(__u32 *digest, const char *data, __u32 *W);
+void sha_transform(__u32 *digest, const void *data, bool wipe);

#define MD5_DIGEST_WORDS 4
#define MD5_MESSAGE_BYTES 64
diff --git a/lib/sha1.c b/lib/sha1.c
index f33271d..a78ca29 100644
--- a/lib/sha1.c
+++ b/lib/sha1.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bitops.h>
+#include <linux/cryptohash.h>
#include <asm/unaligned.h>

/*
@@ -41,45 +42,66 @@
#endif

/* This "rolls" over the 512-bit array */
-#define W(x) (array[(x)&15])
+#define W(x) (workspace.array[(x)&15])

/*
* Where do we get the source from? The first 16 iterations get it from
* the input data, the next mix it from the 512-bit array.
*/
-#define SHA_SRC(t) get_unaligned_be32((__u32 *)data + t)
+#define SHA_SRC(t) (workspace.aligned_data[t])
#define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1)

-#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
- __u32 TEMP = input(t); setW(t, TEMP); \
- E += TEMP + rol32(A,5) + (fn) + (constant); \
- B = ror32(B, 2); } while (0)
-
-#define T_0_15(t, A, B, C, D, E) SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
-#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
-#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E )
-#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E )
-#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0xca62c1d6, A, B, C, D, E )
+#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) \
+do { \
+ __u32 TEMP = input(t); \
+ \
+ setW(t, TEMP); \
+ E += TEMP + rol32(A, 5) + (fn) + (constant); \
+ B = ror32(B, 2); \
+} while (0)
+
+#define T_0_15(t, A, B, C, D, E) \
+ SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D), 0x5a827999, A, B, C, D, E)
+#define T_16_19(t, A, B, C, D, E) \
+ SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D), 0x5a827999, A, B, C, D, E)
+#define T_20_39(t, A, B, C, D, E) \
+ SHA_ROUND(t, SHA_MIX, (B^C^D), 0x6ed9eba1, A, B, C, D, E)
+#define T_40_59(t, A, B, C, D, E) \
+ SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))), 0x8f1bbcdc, A, B, C, D, E)
+#define T_60_79(t, A, B, C, D, E) \
+ SHA_ROUND(t, SHA_MIX, (B^C^D), 0xca62c1d6, A, B, C, D, E)

/**
* sha_transform - single block SHA1 transform
*
* @digest: 160 bit digest to update
* @data: 512 bits of data to hash
- * @array: 16 words of workspace (see note)
+ * @wipe: true if the hash is security sensitive
*
* This function generates a SHA1 digest for a single 512-bit block.
* Be warned, it does not handle padding and message digest, do not
* confuse it with the full FIPS 180-1 digest algorithm for variable
* length messages.
- *
- * Note: If the hash is security sensitive, the caller should be sure
- * to clear the workspace. This is left to the caller to avoid
- * unnecessary clears between chained hashing operations.
*/
-void sha_transform(__u32 *digest, const char *data, __u32 *array)
+void sha_transform(__u32 *digest, const void *data, bool wipe)
{
__u32 A, B, C, D, E;
+ struct {
+ __u32 array[16]; /* working array */
+ __u32 aligned[16]; /* u32 aligned version of data */
+ const __u32 *aligned_data; /* either data or aligned */
+ } workspace;
+ size_t wipe_size;
+
+ if (((unsigned long)data) & 3) { /* unaligned word accesses */
+ workspace.aligned_data =
+ memcpy(workspace.aligned, data,
+ sizeof(workspace.aligned));
+ wipe_size = sizeof(workspace);
+ } else {
+ workspace.aligned_data = data;
+ wipe_size = sizeof(workspace.array);
+ }

A = digest[0];
B = digest[1];
@@ -182,6 +204,9 @@ void sha_transform(__u32 *digest, const char *data, __u32 *array)
digest[2] += C;
digest[3] += D;
digest[4] += E;
+
+ if (wipe)
+ memset(&workspace, 0, wipe_size);
}
EXPORT_SYMBOL(sha_transform);

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 92bb943..8f429cd 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -37,8 +37,7 @@ __initcall(init_syncookies);
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)

-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
- ipv4_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5], ipv4_cookie_scratch);

static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
@@ -50,7 +49,7 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
tmp[1] = (__force u32)daddr;
tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
tmp[3] = count;
- sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+ sha_transform(tmp + 16, tmp, false);

return tmp[17];
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 882e0b0..454ed67 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2494,7 +2494,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
}

if (opts.hash_size > 0) {
- __u32 workspace[SHA_WORKSPACE_WORDS];
u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];

@@ -2510,9 +2509,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */

- sha_transform((__u32 *)&xvp->cookie_bakery[0],
- (char *)mess,
- &workspace[0]);
+ sha_transform((__u32 *)&xvp->cookie_bakery[0], mess,
+ false);
opts.hash_location =
(__u8 *)&xvp->cookie_bakery[0];
}
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 89d5bf8..90823e0 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -63,8 +63,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
return child;
}

-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
- ipv6_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5], ipv6_cookie_scratch);

static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
__be16 sport, __be16 dport, u32 count, int c)
@@ -81,7 +80,7 @@ static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *dadd
memcpy(tmp + 4, daddr, 16);
tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
tmp[9] = count;
- sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+ sha_transform(tmp + 16, tmp, false);

return tmp[17];
}
--
1.7.6.405.gc1be0

2011-08-09 15:41:51

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH] treewide: Update sha_transform

On Tue, Aug 9, 2011 at 1:58 AM, Joe Perches <[email protected]> wrote:
>
> Eliminate possible sha_transform unaligned accesses to data by copying
> data to an aligned __u32 array if necessary.

This is wrong. Not only does it double the stack space, when I tried
it for git it just made things slower. So don't do it.

If some architecture has a bad "get_unaligned_be32()", then that's
just something for the arch maintainers to fix.

> Add sha_transform wipe argument to force workspace clearing if desired.

I disagree. As already mentioned, any competent compiler would make
that thing go away anyway. And there is no reason to believe that it's
actually a real fix for any real security issue, so it's voodoo
programming for several reasons.

We shouldn't do voodoo stuff. Or rather, I'm perfectly ok if you guys
all do your little wax figures of me in the privacy of your own homes
- freedom of religion and all that - but please don't do it in the
kernel.

Linus

2011-08-10 00:57:31

by Joe Perches

[permalink] [raw]

Subject: Re: [PATCH] treewide: Update sha_transform

On Tue, 2011-08-09 at 08:41 -0700, Linus Torvalds wrote:
> On Tue, Aug 9, 2011 at 1:58 AM, Joe Perches <[email protected]> wrote:
> > Eliminate possible sha_transform unaligned accesses to data by copying
> > data to an aligned __u32 array if necessary.
> This is wrong. Not only does it double the stack space, when I tried
> it for git it just made things slower. So don't do it.

Maybe the aligned copy should be in an
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS block.

Maybe Mandeep could test aligned/unaligned speeds
for his board?