2020-10-19 15:31:45

by Arvind Sankar

[permalink] [raw]
Subject: [PATCH 0/5] crypto: lib/sha256 - cleanup/optimization

Patch 1 -- Use memzero_explicit() instead of structure assignment/plain
memset() to clear sensitive state.

Patch 2 -- I am not sure about this one: currently the temporary
variables used in the generic sha256 implementation are cleared, but the
clearing is optimized away due to lack of compiler barriers. I don't
think it's really necessary to clear them, but I'm not a cryptanalyst,
so I would like comments on whether it's indeed safe not to, or whether
we should instead add the required barriers to force clearing.

The last three patches are optimizations for generic sha256.

Arvind Sankar (5):
crypto: Use memzero_explicit() for clearing state
crypto: lib/sha256 - Don't clear temporary variables
crypto: lib/sha256 - Clear W[] in sha256_update() instead of
sha256_transform()
crypto: lib/sha256 - Unroll SHA256 loop 8 times instead of 64
crypto: lib/sha256 - Unroll LOAD and BLEND loops

include/crypto/sha1_base.h | 3 +-
include/crypto/sha256_base.h | 3 +-
include/crypto/sha512_base.h | 3 +-
include/crypto/sm3_base.h | 3 +-
lib/crypto/sha256.c | 202 ++++++++++-------------------------
5 files changed, 62 insertions(+), 152 deletions(-)

--
2.26.2


2020-10-19 15:31:51

by Arvind Sankar

[permalink] [raw]
Subject: [PATCH 1/5] crypto: Use memzero_explicit() for clearing state

Without the barrier_data() inside memzero_explicit(), the compiler may
optimize away the state-clearing if it can tell that the state is not
used afterwards. At least in lib/crypto/sha256.c:__sha256_final(), the
function can get inlined into sha256(), in which case the memset is
optimized away.

Signed-off-by: Arvind Sankar <[email protected]>
---
include/crypto/sha1_base.h | 3 ++-
include/crypto/sha256_base.h | 3 ++-
include/crypto/sha512_base.h | 3 ++-
include/crypto/sm3_base.h | 3 ++-
lib/crypto/sha256.c | 2 +-
5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/include/crypto/sha1_base.h b/include/crypto/sha1_base.h
index 20fd1f7468af..a5d6033efef7 100644
--- a/include/crypto/sha1_base.h
+++ b/include/crypto/sha1_base.h
@@ -12,6 +12,7 @@
#include <crypto/sha.h>
#include <linux/crypto.h>
#include <linux/module.h>
+#include <linux/string.h>

#include <asm/unaligned.h>

@@ -101,7 +102,7 @@ static inline int sha1_base_finish(struct shash_desc *desc, u8 *out)
for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
put_unaligned_be32(sctx->state[i], digest++);

- *sctx = (struct sha1_state){};
+ memzero_explicit(sctx, sizeof(*sctx));
return 0;
}

diff --git a/include/crypto/sha256_base.h b/include/crypto/sha256_base.h
index 6ded110783ae..93f9fd21cc06 100644
--- a/include/crypto/sha256_base.h
+++ b/include/crypto/sha256_base.h
@@ -12,6 +12,7 @@
#include <crypto/sha.h>
#include <linux/crypto.h>
#include <linux/module.h>
+#include <linux/string.h>

#include <asm/unaligned.h>

@@ -105,7 +106,7 @@ static inline int sha256_base_finish(struct shash_desc *desc, u8 *out)
for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be32))
put_unaligned_be32(sctx->state[i], digest++);

- *sctx = (struct sha256_state){};
+ memzero_explicit(sctx, sizeof(*sctx));
return 0;
}

diff --git a/include/crypto/sha512_base.h b/include/crypto/sha512_base.h
index fb19c77494dc..93ab73baa38e 100644
--- a/include/crypto/sha512_base.h
+++ b/include/crypto/sha512_base.h
@@ -12,6 +12,7 @@
#include <crypto/sha.h>
#include <linux/crypto.h>
#include <linux/module.h>
+#include <linux/string.h>

#include <asm/unaligned.h>

@@ -126,7 +127,7 @@ static inline int sha512_base_finish(struct shash_desc *desc, u8 *out)
for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be64))
put_unaligned_be64(sctx->state[i], digest++);

- *sctx = (struct sha512_state){};
+ memzero_explicit(sctx, sizeof(*sctx));
return 0;
}

diff --git a/include/crypto/sm3_base.h b/include/crypto/sm3_base.h
index 1cbf9aa1fe52..2f3a32ab97bb 100644
--- a/include/crypto/sm3_base.h
+++ b/include/crypto/sm3_base.h
@@ -13,6 +13,7 @@
#include <crypto/sm3.h>
#include <linux/crypto.h>
#include <linux/module.h>
+#include <linux/string.h>
#include <asm/unaligned.h>

typedef void (sm3_block_fn)(struct sm3_state *sst, u8 const *src, int blocks);
@@ -104,7 +105,7 @@ static inline int sm3_base_finish(struct shash_desc *desc, u8 *out)
for (i = 0; i < SM3_DIGEST_SIZE / sizeof(__be32); i++)
put_unaligned_be32(sctx->state[i], digest++);

- *sctx = (struct sm3_state){};
+ memzero_explicit(sctx, sizeof(*sctx));
return 0;
}

diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 2321f6cb322f..d43bc39ab05e 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -265,7 +265,7 @@ static void __sha256_final(struct sha256_state *sctx, u8 *out, int digest_words)
put_unaligned_be32(sctx->state[i], &dst[i]);

/* Zeroize sensitive information. */
- memset(sctx, 0, sizeof(*sctx));
+ memzero_explicit(sctx, sizeof(*sctx));
}

void sha256_final(struct sha256_state *sctx, u8 *out)
--
2.26.2

2020-10-19 15:31:53

by Arvind Sankar

[permalink] [raw]
Subject: [PATCH 3/5] crypto: lib/sha256 - Clear W[] in sha256_update() instead of sha256_transform()

The temporary W[] array is currently zeroed out once per call to
sha256_transform(), i.e. once every 64 bytes of input data. Moving the
array to sha256_update() instead, so that it is cleared only once per
update, can save about 2-3% of the total time taken to compute the
digest with a reasonable memset() implementation, and considerably more
(~20%) with a bad one (e.g. the x86 purgatory currently uses a memset()
coded in C).

Signed-off-by: Arvind Sankar <[email protected]>
---
lib/crypto/sha256.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 099cd11f83c1..c6bfeacc5b81 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -43,10 +43,9 @@ static inline void BLEND_OP(int I, u32 *W)
W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
}

-static void sha256_transform(u32 *state, const u8 *input)
+static void sha256_transform(u32 *state, const u8 *input, u32 *W)
{
u32 a, b, c, d, e, f, g, h, t1, t2;
- u32 W[64];
int i;

/* load the input */
@@ -200,15 +199,13 @@ static void sha256_transform(u32 *state, const u8 *input)

state[0] += a; state[1] += b; state[2] += c; state[3] += d;
state[4] += e; state[5] += f; state[6] += g; state[7] += h;
-
- /* clear any sensitive info... */
- memzero_explicit(W, 64 * sizeof(u32));
}

void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
{
unsigned int partial, done;
const u8 *src;
+ u32 W[64];

partial = sctx->count & 0x3f;
sctx->count += len;
@@ -223,11 +220,13 @@ void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
}

do {
- sha256_transform(sctx->state, src);
+ sha256_transform(sctx->state, src, W);
done += 64;
src = data + done;
} while (done + 63 < len);

+ memzero_explicit(W, sizeof(W));
+
partial = 0;
}
memcpy(sctx->buf + partial, src, len - done);
--
2.26.2