The following patches provide code cleanup and performance improvements for
crypto/sha1.c. They are split up for easier auditing, along with
relevant measurements.
Nicolas
The current code unconditionally copies the first block for every call to
sha1_update(). This can be avoided if there is no pending partial block.
This is always the case on the first call to sha1_update() (if the length
is >= 64, of course).
In the case where sha1_update() is called successively with len=64,
an 8.5% performance increase can be observed on i386, and 8.2% on ARM.
Signed-off-by: Nicolas Pitre <[email protected]>
Index: linux-2.6/crypto/sha1.c
===================================================================
--- linux-2.6.orig/crypto/sha1.c
+++ linux-2.6/crypto/sha1.c
@@ -48,23 +48,26 @@
static void sha1_update(void *ctx, const u8 *data, unsigned int len)
{
struct sha1_ctx *sctx = ctx;
- unsigned int i, j;
+ unsigned int partial, done;
u32 temp[SHA_WORKSPACE_WORDS];
- j = (sctx->count >> 3) & 0x3f;
+ partial = (sctx->count >> 3) & 0x3f;
sctx->count += len << 3;
+ done = 0;
- if ((j + len) > 63) {
- memcpy(&sctx->buffer[j], data, (i = 64-j));
- sha_transform(sctx->state, sctx->buffer, temp);
- for ( ; i + 63 < len; i += 64) {
- sha_transform(sctx->state, &data[i], temp);
+ if ((partial + len) > 63) {
+ if (partial) {
+ done = 64 - partial;
+ memcpy(sctx->buffer + partial, data, done);
+ sha_transform(sctx->state, sctx->buffer, temp);
+ partial = 0;
}
- j = 0;
+ for ( ; done + 63 < len; done += 64)
+ sha_transform(sctx->state, data + done, temp);
}
- else i = 0;
+ if (len - done)
+ memcpy(sctx->buffer + partial, data + done, len - done);
memset(temp, 0, sizeof(temp));
- memcpy(&sctx->buffer[j], &data[i], len - i);
}
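For illustration, here is a minimal stand-alone user-space sketch of the
buffering logic this patch introduces. The demo_ctx structure, the demo_*
names and the stub transform are simplified stand-ins for this sketch only,
not the kernel's symbols; the transform merely counts invocations so the
control flow can be exercised.

/*
 * Stand-alone sketch of the new sha1_update() buffering behaviour.
 * demo_ctx, demo_transform() and demo_update() are simplified stand-ins,
 * not the kernel's definitions; the transform only counts invocations.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_ctx {
        uint64_t count;                 /* still kept in bits at this point */
        uint32_t state[5];
        uint8_t  buffer[64];
};

static unsigned int transforms;

static void demo_transform(uint32_t *state, const uint8_t *block)
{
        (void)state;
        (void)block;
        transforms++;
}

static void demo_update(struct demo_ctx *sctx, const uint8_t *data,
                        unsigned int len)
{
        unsigned int partial, done;

        partial = (sctx->count >> 3) & 0x3f;
        sctx->count += (uint64_t)len << 3;
        done = 0;

        if ((partial + len) > 63) {
                /* Only touch the buffer if a partial block is pending. */
                if (partial) {
                        done = 64 - partial;
                        memcpy(sctx->buffer + partial, data, done);
                        demo_transform(sctx->state, sctx->buffer);
                        partial = 0;
                }
                /* Full blocks are consumed straight from the caller's data. */
                for (; done + 63 < len; done += 64)
                        demo_transform(sctx->state, data + done);
        }
        /* Stash whatever is left over for the next call. */
        if (len - done)
                memcpy(sctx->buffer + partial, data + done, len - done);
}

int main(void)
{
        struct demo_ctx ctx = { 0 };
        uint8_t chunk[64] = { 0 };
        int i;

        /* Aligned 64-byte calls never hit the memcpy()-into-buffer path. */
        for (i = 0; i < 4; i++)
                demo_update(&ctx, chunk, sizeof(chunk));
        printf("%u transforms, %llu bits hashed\n",
               transforms, (unsigned long long)ctx.count);
        return 0;
}

With four aligned 64-byte calls as above, every block goes straight from the
caller's buffer to the transform; the memcpy() into sctx->buffer is never
executed.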
This patch avoids shifting the count left and right needlessly for each
call to sha1_update(). Instead, it can be done only once at the end in
sha1_final().
Keeping the previous test example (sha1_update() successively called with
len=64), a 1.3% performance increase can be observed on i386, or 0.2% on
ARM. The generated code is also smaller on ARM.
Signed-off-by: Nicolas Pitre <[email protected]>
Index: linux-2.6/crypto/sha1.c
===================================================================
--- linux-2.6.orig/crypto/sha1.c
+++ linux-2.6/crypto/sha1.c
@@ -51,8 +51,8 @@
unsigned int partial, done;
u32 temp[SHA_WORKSPACE_WORDS];
- partial = (sctx->count >> 3) & 0x3f;
- sctx->count += len << 3;
+ partial = sctx->count & 0x3f;
+ sctx->count += len;
done = 0;
if ((partial + len) > 63) {
@@ -80,7 +80,7 @@
u8 bits[8] = { 0, };
static const u8 padding[64] = { 0x80, };
- t = sctx->count;
+ t = sctx->count << 3;
bits[7] = 0xff & t; t>>=8;
bits[6] = 0xff & t; t>>=8;
bits[5] = 0xff & t; t>>=8;
@@ -91,7 +91,7 @@
bits[0] = 0xff & t;
/* Pad out to 56 mod 64 */
- index = (sctx->count >> 3) & 0x3f;
+ index = sctx->count & 0x3f;
padlen = (index < 56) ? (56 - index) : ((64+56) - index);
sha1_update(sctx, padding, padlen);
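To make the bookkeeping change concrete, here is a small stand-alone check
(illustrative only, not kernel code) that, with the count kept in bytes, the
partial-block index computed in sha1_update() and the bit length appended in
sha1_final() come out exactly as before:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        static const unsigned int lens[] = { 64, 3, 61, 200, 1 };
        uint64_t count_bits = 0, count_bytes = 0;
        unsigned int i;

        for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
                count_bits  += (uint64_t)lens[i] << 3; /* old sha1_update() */
                count_bytes += lens[i];                /* new sha1_update() */

                /* the partial-block index is identical either way */
                assert(((count_bits >> 3) & 0x3f) == (count_bytes & 0x3f));
        }
        /* the bit length appended by sha1_final() is identical as well */
        assert(count_bits == count_bytes << 3);
        return 0;
}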
The construction of the final length buffer in sha1_final() is moved to just
before where it is used. This makes for slightly more readable code, and it is
actually more efficient as well since the compiler doesn't have to bother
preserving the bits array across the first call to sha1_update(). In fact it
results in slightly smaller stack usage on i386 and slightly smaller code size
on ARM (this could vary depending on your GCC version, of course).
Signed-off-by: Nicolas Pitre <[email protected]>
Index: linux-2.6/crypto/sha1.c
===================================================================
--- linux-2.6.orig/crypto/sha1.c
+++ linux-2.6/crypto/sha1.c
@@ -76,11 +76,17 @@
{
struct sha1_ctx *sctx = ctx;
u32 i, j, index, padlen;
- u64 t;
- u8 bits[8] = { 0, };
+ u64 t, count = sctx->count;
+ u8 bits[8];
static const u8 padding[64] = { 0x80, };
- t = sctx->count << 3;
+ /* Pad out to 56 mod 64 */
+ index = count & 0x3f;
+ padlen = (index < 56) ? (56 - index) : ((64+56) - index);
+ sha1_update(sctx, padding, padlen);
+
+ /* Append length */
+ t = count << 3;
bits[7] = 0xff & t; t>>=8;
bits[6] = 0xff & t; t>>=8;
bits[5] = 0xff & t; t>>=8;
@@ -89,13 +95,6 @@
bits[2] = 0xff & t; t>>=8;
bits[1] = 0xff & t; t>>=8;
bits[0] = 0xff & t;
-
- /* Pad out to 56 mod 64 */
- index = sctx->count & 0x3f;
- padlen = (index < 56) ? (56 - index) : ((64+56) - index);
- sha1_update(sctx, padding, padlen);
-
- /* Append length */
sha1_update(sctx, bits, sizeof bits);
/* Store state in digest */
[PATCH 4/5] crypto/sha1.c: avoid successively shifting a long long
Shifting a long long seven times to store the length bits is suboptimal
on most 32-bit architectures. Use a 32-bit scratch variable instead.
This provides an appreciable code reduction considering the _whole_
of the sha1_final() function:
arch     old size   new size   reduction
----------------------------------------
i386     0xe0       0xc4       12.5%
arm      0x15c      0xe8       33.3%
Smaller code in this case is of course faster code.
Signed-off-by: Nicolas Pitre <[email protected]>
Index: linux-2.6/crypto/sha1.c
===================================================================
--- linux-2.6.orig/crypto/sha1.c
+++ linux-2.6/crypto/sha1.c
@@ -75,8 +75,8 @@
static void sha1_final(void* ctx, u8 *out)
{
struct sha1_ctx *sctx = ctx;
- u32 i, j, index, padlen;
- u64 t, count = sctx->count;
+ u64 count = sctx->count;
+ u32 i, j, index, padlen, t;
u8 bits[8];
static const u8 padding[64] = { 0x80, };
@@ -90,7 +90,8 @@
bits[7] = 0xff & t; t>>=8;
bits[6] = 0xff & t; t>>=8;
bits[5] = 0xff & t; t>>=8;
- bits[4] = 0xff & t; t>>=8;
+ bits[4] = 0xff & t;
+ t = count >> (32 - 3);
bits[3] = 0xff & t; t>>=8;
bits[2] = 0xff & t; t>>=8;
bits[1] = 0xff & t; t>>=8;
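The point of the change is that the eight big-endian length bytes can be
assembled from two 32-bit halves of count << 3: the low half directly, and
the high half as count >> (32 - 3). A stand-alone check of that equivalence
(illustrative only, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* the old way: shift a 64-bit value seven times */
static void length_bytes_u64(uint64_t count, uint8_t out[8])
{
        uint64_t t = count << 3;
        int i;

        for (i = 7; i >= 0; i--) {
                out[i] = t & 0xff;
                t >>= 8;
        }
}

/* the new way: two 32-bit scratch values */
static void length_bytes_u32(uint64_t count, uint8_t out[8])
{
        uint32_t t = count << 3;                /* low 32 bits of count * 8 */

        out[7] = t & 0xff; t >>= 8;
        out[6] = t & 0xff; t >>= 8;
        out[5] = t & 0xff; t >>= 8;
        out[4] = t & 0xff;
        t = count >> (32 - 3);                  /* high 32 bits of count * 8 */
        out[3] = t & 0xff; t >>= 8;
        out[2] = t & 0xff; t >>= 8;
        out[1] = t & 0xff; t >>= 8;
        out[0] = t & 0xff;
}

int main(void)
{
        static const uint64_t samples[] = {
                0, 1, 64, 0x12345678, 0xfedcba9876543ULL
        };
        uint8_t a[8], b[8];
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                length_bytes_u64(samples[i], a);
                length_bytes_u32(samples[i], b);
                assert(memcmp(a, b, 8) == 0);
        }
        return 0;
}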
We need neither j nor t2. Simplify the code a bit by reusing t and
incrementing the 'out' pointer (it is a bit easier on the compiler as it
reduces register pressure).
Signed-off-by: Nicolas Pitre <[email protected]>
Index: linux-2.6/crypto/sha1.c
===================================================================
--- linux-2.6.orig/crypto/sha1.c
+++ linux-2.6/crypto/sha1.c
@@ -76,7 +76,7 @@
{
struct sha1_ctx *sctx = ctx;
u64 count = sctx->count;
- u32 i, j, index, padlen, t;
+ u32 i, index, padlen, t;
u8 bits[8];
static const u8 padding[64] = { 0x80, };
@@ -99,12 +99,13 @@
sha1_update(sctx, bits, sizeof bits);
/* Store state in digest */
- for (i = j = 0; i < 5; i++, j += 4) {
- u32 t2 = sctx->state[i];
- out[j+3] = t2 & 0xff; t2>>=8;
- out[j+2] = t2 & 0xff; t2>>=8;
- out[j+1] = t2 & 0xff; t2>>=8;
- out[j ] = t2 & 0xff;
+ for (i = 0; i < 5; i++) {
+ t = sctx->state[i];
+ out[3] = t & 0xff; t>>=8;
+ out[2] = t & 0xff; t>>=8;
+ out[1] = t & 0xff; t>>=8;
+ out[0] = t & 0xff;
+ out += 4;
}
/* Wipe context */
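Taken together, patches 3-5 leave sha1_final() structured as below. This is a
stand-alone user-space sketch using simplified demo_* stand-ins (the count is
kept in bytes, per patch 2, and the transform is a trivial mixing stub), so it
does not produce a real SHA-1 digest; only the shape of demo_final() mirrors
the patched kernel function.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct demo_ctx {
        uint64_t count;                         /* bytes hashed so far */
        uint32_t state[5];
        uint8_t  buffer[64];
};

/* Stand-in for sha_transform(): a trivial mix, not real SHA-1. */
static void demo_transform(uint32_t *state, const uint8_t *block)
{
        int i;

        for (i = 0; i < 64; i++)
                state[i % 5] ^= (uint32_t)block[i] << (i % 24);
}

static void demo_update(struct demo_ctx *sctx, const uint8_t *data,
                        unsigned int len)
{
        unsigned int partial = sctx->count & 0x3f, done = 0;

        sctx->count += len;
        if ((partial + len) > 63) {
                if (partial) {
                        done = 64 - partial;
                        memcpy(sctx->buffer + partial, data, done);
                        demo_transform(sctx->state, sctx->buffer);
                        partial = 0;
                }
                for (; done + 63 < len; done += 64)
                        demo_transform(sctx->state, data + done);
        }
        if (len - done)
                memcpy(sctx->buffer + partial, data + done, len - done);
}

static void demo_final(struct demo_ctx *sctx, uint8_t *out)
{
        uint64_t count = sctx->count;
        uint32_t i, index, padlen, t;
        uint8_t bits[8];
        static const uint8_t padding[64] = { 0x80, };

        /* Pad out to 56 mod 64 (moved up by patch 3) */
        index = count & 0x3f;
        padlen = (index < 56) ? (56 - index) : ((64 + 56) - index);
        demo_update(sctx, padding, padlen);

        /* Append length, built from two 32-bit halves (patch 4) */
        t = count << 3;
        bits[7] = 0xff & t; t >>= 8;
        bits[6] = 0xff & t; t >>= 8;
        bits[5] = 0xff & t; t >>= 8;
        bits[4] = 0xff & t;
        t = count >> (32 - 3);
        bits[3] = 0xff & t; t >>= 8;
        bits[2] = 0xff & t; t >>= 8;
        bits[1] = 0xff & t; t >>= 8;
        bits[0] = 0xff & t;
        demo_update(sctx, bits, sizeof(bits));

        /* Store state in digest, advancing 'out' instead of indexing (patch 5) */
        for (i = 0; i < 5; i++) {
                t = sctx->state[i];
                out[3] = t & 0xff; t >>= 8;
                out[2] = t & 0xff; t >>= 8;
                out[1] = t & 0xff; t >>= 8;
                out[0] = t & 0xff;
                out += 4;
        }

        /* Wipe context */
        memset(sctx, 0, sizeof(*sctx));
}

int main(void)
{
        struct demo_ctx ctx = { 0 };
        uint8_t digest[20];
        int i;

        demo_update(&ctx, (const uint8_t *)"abc", 3);
        demo_final(&ctx, digest);
        for (i = 0; i < 20; i++)
                printf("%02x", (unsigned int)digest[i]);
        printf("\n");
        return 0;
}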
On Mon, Oct 24, 2005 at 05:57:06PM +0000, Nicolas Pitre wrote:
>
> This patch avoids shifting the count left and right needlessly for each
> call to sha1_update(). It instead can be done only once at the end in
> sha1_final().
>
> Keeping the previous test example (sha1_update() successively called with
> len=64), a 1.3% performance increase can be observed on i386, or 0.2% on
> ARM. The generated code is also smaller on ARM.
Patch applied.
So in summary, patch 1 has been applied with some additional changes.
Patch 2 is applied as is, except for a change to accommodate the cpu->be
change from Denis.
Patches 3-5 have been made redundant by the cpu->be change.
Thanks again for your efforts in making sha1 smaller and faster.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt