LinuxLists.cc - [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage

2021-05-02 19:36:02

Subject: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage

The S array does not need to be u32, u8 is enough
On machine which have efficient unaligned access, use u8 to save some
memory.

So, provide a version optimized for memory usage in such a case.

Based on my testing, the size of arc4_ctx is:
u8 version: 264
u32 version: 1032

On my machine, the u8 version is about 5% faster.
It save ~800 bytes of memory or stack depending on how arc4_ctx is stored.
It is likely to be more cache friendly.

It has been tested an Core i7-3770 with the following test program:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#define u8 unsigned char
#define u32 unsigned int
#define true 1

struct arc4_ctx_8 {
u8 S[256];
u32 x, y;
};
struct arc4_ctx_32 {
u32 S[256];
u32 x, y;
};

#define S_type u8
int arc4_setkey_8(struct arc4_ctx_8 *ctx, const u8 *in_key, unsigned int key_len)
{
int i, j = 0, k = 0;

ctx->x = 1;
ctx->y = 0;

for (i = 0; i < 256; i++)
ctx->S[i] = i;

for (i = 0; i < 256; i++) {
S_type a = ctx->S[i];

j = (j + in_key[k] + a) & 0xff;
ctx->S[i] = ctx->S[j];
ctx->S[j] = a;
if (++k >= key_len)
k = 0;
}

return 0;
}

void arc4_crypt_8(struct arc4_ctx_8 *ctx, u8 *out, const u8 *in, unsigned int len)
{
S_type *const S = ctx->S;
S_type a, b, ta, tb;
u32 x, y, ty;

if (len == 0)
return;

x = ctx->x;
y = ctx->y;

a = S[x];
y = (y + a) & 0xff;
b = S[y];

do {
S[y] = a;
a = (a + b) & 0xff;
S[x] = b;
x = (x + 1) & 0xff;
ta = S[x];
ty = (y + ta) & 0xff;
tb = S[ty];
*out++ = *in++ ^ S[a];
if (--len == 0)
break;
y = ty;
a = ta;
b = tb;
} while (true);

ctx->x = x;
ctx->y = y;
}

#undef S_type
#define S_type u32
int arc4_setkey_32(struct arc4_ctx_32 *ctx, const u8 *in_key, unsigned int key_len)
{
int i, j = 0, k = 0;

ctx->x = 1;
ctx->y = 0;

for (i = 0; i < 256; i++)
ctx->S[i] = i;

for (i = 0; i < 256; i++) {
S_type a = ctx->S[i];

j = (j + in_key[k] + a) & 0xff;
ctx->S[i] = ctx->S[j];
ctx->S[j] = a;
if (++k >= key_len)
k = 0;
}

return 0;
}

void arc4_crypt_32(struct arc4_ctx_32 *ctx, u8 *out, const u8 *in, unsigned int len)
{
S_type *const S = ctx->S;
S_type a, b, ta, tb;
u32 x, y, ty;

if (len == 0)
return;

x = ctx->x;
y = ctx->y;

a = S[x];
y = (y + a) & 0xff;
b = S[y];

do {
S[y] = a;
a = (a + b) & 0xff;
S[x] = b;
x = (x + 1) & 0xff;
ta = S[x];
ty = (y + ta) & 0xff;
tb = S[ty];
*out++ = *in++ ^ S[a];
if (--len == 0)
break;
y = ty;
a = ta;
b = tb;
} while (true);

ctx->x = x;
ctx->y = y;
}

#define KEY "AZERTY"
#define in "AZERTYUIOP_QSDFGHJKLM_WXCVBN"

int main() {
long i;
struct arc4_ctx_8 ctx_8;
u8 out8[1024] = { };
struct arc4_ctx_32 ctx_32;
u8 out32[1024] = { };

arc4_setkey_8(&ctx_8, KEY, strlen(KEY));
arc4_crypt_8(&ctx_8, out8, in, strlen(in));

arc4_setkey_32(&ctx_32, KEY, strlen(KEY));
arc4_crypt_32(&ctx_32, out32, in, strlen(in));

printf("%ld vs %ld\n", sizeof(ctx_8), sizeof(ctx_32));
if (memcmp(out8, out32, 1024) == 0)
printf("Ok\n");
else
printf("Broken\n");

return 0;
}

Signed-off-by: Christophe JAILLET <[email protected]>
---
The idea came from code found in staging/rtl8712/
See at the top of:
https://elixir.bootlin.com/linux/v5.12/source/drivers/staging/rtl8712/rtl871x_security.c

More precisely, in an attempt to clean staging/rtl8712/, I triggered the
kernel test robot about some increasing stack usage:
https://lore.kernel.org/kernel-janitors/YHQUH+Nqc%[email protected]/T/#m832a01a9d1517e7efc4f671ed46deae9993d6ae9

The above patch works for me, but should be taken as a RFC.
---
include/crypto/arc4.h | 8 +++++++-
lib/crypto/arc4.c | 8 ++++----
2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/crypto/arc4.h b/include/crypto/arc4.h
index f3c22fe01704..39545ed486e2 100644
--- a/include/crypto/arc4.h
+++ b/include/crypto/arc4.h
@@ -12,8 +12,14 @@
#define ARC4_MAX_KEY_SIZE 256
#define ARC4_BLOCK_SIZE 1

+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#define S_type u8
+#else
+#define S_type u32
+#endif
+
struct arc4_ctx {
- u32 S[256];
+ S_type S[256];
u32 x, y;
};

diff --git a/lib/crypto/arc4.c b/lib/crypto/arc4.c
index c2020f19c652..e0be0c2a08d9 100644
--- a/lib/crypto/arc4.c
+++ b/lib/crypto/arc4.c
@@ -21,7 +21,7 @@ int arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len)
ctx->S[i] = i;

for (i = 0; i < 256; i++) {
- u32 a = ctx->S[i];
+ S_type a = ctx->S[i];

j = (j + in_key[k] + a) & 0xff;
ctx->S[i] = ctx->S[j];
@@ -36,9 +36,9 @@ EXPORT_SYMBOL(arc4_setkey);

void arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len)
{
- u32 *const S = ctx->S;
- u32 x, y, a, b;
- u32 ty, ta, tb;
+ S_type *const S = ctx->S;
+ S_type a, b, ta, tb;
+ u32 x, y, ty;

if (len == 0)
return;
--
2.30.2

2021-05-04 17:18:11

by Eric Biggers

[permalink] [raw]

Subject: Re: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage

On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> +#define S_type u8
> +#else
> +#define S_type u32
> +#endif
> +
> struct arc4_ctx {
> - u32 S[256];
> + S_type S[256];
> u32 x, y;
> };

Is it actually useful to keep both versions? It seems we could just use the u8
version everywhere. Note that there aren't actually any unaligned memory
accesses, so choosing the version conditionally on
CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd. What are you trying to
determine by checking that?

- Eric

2021-05-04 18:00:58

by Christophe JAILLET

[permalink] [raw]

Subject: Re: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage

Le 04/05/2021 à 18:57, Eric Biggers a écrit :
> On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
>> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
>> +#define S_type u8
>> +#else
>> +#define S_type u32
>> +#endif
>> +
>> struct arc4_ctx {
>> - u32 S[256];
>> + S_type S[256];
>> u32 x, y;
>> };
>
> Is it actually useful to keep both versions? It seems we could just use the u8
> version everywhere. Note that there aren't actually any unaligned memory
> accesses, so choosing the version conditionally on
> CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd. What are you trying to
> determine by checking that?

Hi, this is a bad interpretation from me.

I thought that S[1] would likely use an odd address and would trigger an
unaligned access. But as we would read only 1 byte, this is not the case.

Looking at [1], we have : "At this point, it should be clear that
accessing a single byte (u8 or char) will never cause an unaligned
access, because all memory addresses are evenly divisible by one."

I wanted to avoid potential performance cost related to using char (i.e
u8) instead of int (i.e. u32).
On some architecture this could require some shift or masking or
whatever to "unpack" the values of S.

[1]:
https://www.kernel.org/doc/html/latest/core-api/unaligned-memory-access.html

CJ

>
> - Eric
>

2021-05-04 19:37:56

by Eric Biggers

[permalink] [raw]

Subject: Re: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage

On Tue, May 04, 2021 at 07:59:38PM +0200, Christophe JAILLET wrote:
> Le 04/05/2021 ? 18:57, Eric Biggers a ?crit?:
> > On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
> > > +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> > > +#define S_type u8
> > > +#else
> > > +#define S_type u32
> > > +#endif
> > > +
> > > struct arc4_ctx {
> > > - u32 S[256];
> > > + S_type S[256];
> > > u32 x, y;
> > > };
> >
> > Is it actually useful to keep both versions? It seems we could just use the u8
> > version everywhere. Note that there aren't actually any unaligned memory
> > accesses, so choosing the version conditionally on
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd. What are you trying to
> > determine by checking that?
>
> Hi, this is a bad interpretation from me.
>
> I thought that S[1] would likely use an odd address and would trigger an
> unaligned access. But as we would read only 1 byte, this is not the case.
>
> Looking at [1], we have : "At this point, it should be clear that accessing
> a single byte (u8 or char) will never cause an unaligned access, because all
> memory addresses are evenly divisible by one."
>
>
> I wanted to avoid potential performance cost related to using char (i.e u8)
> instead of int (i.e. u32).
> On some architecture this could require some shift or masking or whatever to
> "unpack" the values of S.
>
>
> [1]:
> https://www.kernel.org/doc/html/latest/core-api/unaligned-memory-access.html
>
> CJ
>

arc4 is an insecure cipher which is only supported for use in legacy protocols.
So we don't really need to worry about optimizing performance on every
architecture. If the byte-based version is *usually* faster as well as uses
less memory, we probably should just use it everywhere.

- Eric

2021-05-05 10:46:30

by David Laight

[permalink] [raw]

Subject: RE: [RFC PATCH] crypto: arc4: Implement a version optimized for memory usage

From: Christophe JAILLET
> Sent: 04 May 2021 19:00
>
> Le 04/05/2021 à 18:57, Eric Biggers a écrit :
> > On Sun, May 02, 2021 at 09:29:46PM +0200, Christophe JAILLET wrote:
> >> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
> >> +#define S_type u8
> >> +#else
> >> +#define S_type u32
> >> +#endif
> >> +
> >> struct arc4_ctx {
> >> - u32 S[256];
> >> + S_type S[256];
> >> u32 x, y;
> >> };
> >
> > Is it actually useful to keep both versions? It seems we could just use the u8
> > version everywhere. Note that there aren't actually any unaligned memory
> > accesses, so choosing the version conditionally on
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS seems odd. What are you trying to
> > determine by checking that?
>
> Hi, this is a bad interpretation from me.
...
>
> I wanted to avoid potential performance cost related to using char (i.e
> u8) instead of int (i.e. u32).
> On some architecture this could require some shift or masking or
> whatever to "unpack" the values of S.

The only architecture that Linux ran on where the hardware
did RMW accesses for byte writes was some very old alpha cpu.
Even more recent alpha supported byte writes to memory.

On many architectures (not x86 or arm) indexing a byte array
is better because it saves the instruction to multiply the index by 4.
On x86-64 you want to be using 'unsigned int' for array indexes
so the compiler doesn't have to emit the instruction to sign extend
a 32bit int to 64 bits (sometimes it knows it can't be needed).

FWIW with a modern compiler all those temporaries are pointless.
The number of lines of code can be halved.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)