From: =?UTF-8?B?T25kcmVqIE1vc27DocSNZWs=?= Subject: Re: [PATCH v4] crypto: gf128mul - define gf128mul_x_* in gf128mul.h Date: Sat, 1 Apr 2017 17:21:57 +0200 Message-ID: References: <20170401151755.11875-1-omosnacek@gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: quoted-printable Cc: "David S. Miller" , linux-crypto@vger.kernel.org, Jeffrey Walton , Milan Broz , Ondrej Mosnacek , Eric Biggers To: Herbert Xu Return-path: Received: from mail-lf0-f67.google.com ([209.85.215.67]:36102 "EHLO mail-lf0-f67.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751545AbdDAPWU (ORCPT ); Sat, 1 Apr 2017 11:22:20 -0400 Received: by mail-lf0-f67.google.com with SMTP id n78so9615143lfi.3 for ; Sat, 01 Apr 2017 08:22:19 -0700 (PDT) In-Reply-To: Sender: linux-crypto-owner@vger.kernel.org List-ID: Never mind, Gmail is confusing me... there is indeed "v4" in the subject :) O.M. 2017-04-01 17:19 GMT+02:00 Ondrej Mosnáček : > Oops, sorry, wrong prefix... > > 2017-04-01 17:17 GMT+02:00 Ondrej Mosnacek : >> The gf128mul_x_ble function is currently defined in gf128mul.c, because >> it depends on the gf128mul_table_be multiplication table. >> >> However, since the function is very small and only uses two values from >> the table, it is better for it to be defined as an inline function in >> gf128mul.h. That way, the function can be inlined by the compiler for >> better performance. >> >> For consistency, the other gf128mul_x_* functions are also moved to the >> header file. In addition, the code is rewritten to be constant-time. >> >> After this change, the speed of the generic 'xts(aes)' implementation >> increased from ~225 MiB/s to ~235 MiB/s (measured using 'cryptsetup >> benchmark -c aes-xts-plain64' on an Intel system with CRYPTO_AES_X86_64 >> and CRYPTO_AES_NI_INTEL disabled). 
>> >> Signed-off-by: Ondrej Mosnacek >> Cc: Eric Biggers >> --- >> v3 -> v4: a faster version of gf128mul_x_lle >> v2 -> v3: constant-time implementation >> v1 -> v2: move all _x_ functions to the header, not just gf128mul_x_ble >> >> crypto/gf128mul.c | 33 +--------------------------- >> include/crypto/gf128mul.h | 55 +++++++++++++++++++++++++++++++++++++++++++++-- >> 2 files changed, 54 insertions(+), 34 deletions(-) >> >> diff --git a/crypto/gf128mul.c b/crypto/gf128mul.c >> index 04facc0..dc01212 100644 >> --- a/crypto/gf128mul.c >> +++ b/crypto/gf128mul.c >> @@ -130,43 +130,12 @@ static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); >> static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); >> >> /* >> - * The following functions multiply a field element by x or by x^8 in >> + * The following functions multiply a field element by x^8 in >> * the polynomial field representation. They use 64-bit word operations >> * to gain speed but compensate for machine endianness and hence work >> * correctly on both styles of machine. 
>> */ >> >> -static void gf128mul_x_lle(be128 *r, const be128 *x) >> -{ >> - u64 a = be64_to_cpu(x->a); >> - u64 b = be64_to_cpu(x->b); >> - u64 _tt = gf128mul_table_le[(b << 7) & 0xff]; >> - >> - r->b = cpu_to_be64((b >> 1) | (a << 63)); >> - r->a = cpu_to_be64((a >> 1) ^ (_tt << 48)); >> -} >> - >> -static void gf128mul_x_bbe(be128 *r, const be128 *x) >> -{ >> - u64 a = be64_to_cpu(x->a); >> - u64 b = be64_to_cpu(x->b); >> - u64 _tt = gf128mul_table_be[a >> 63]; >> - >> - r->a = cpu_to_be64((a << 1) | (b >> 63)); >> - r->b = cpu_to_be64((b << 1) ^ _tt); >> -} >> - >> -void gf128mul_x_ble(be128 *r, const be128 *x) >> -{ >> - u64 a = le64_to_cpu(x->a); >> - u64 b = le64_to_cpu(x->b); >> - u64 _tt = gf128mul_table_be[b >> 63]; >> - >> - r->a = cpu_to_le64((a << 1) ^ _tt); >> - r->b = cpu_to_le64((b << 1) | (a >> 63)); >> -} >> -EXPORT_SYMBOL(gf128mul_x_ble); >> - >> static void gf128mul_x8_lle(be128 *x) >> { >> u64 a = be64_to_cpu(x->a); >> diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h >> index 0bc9b5f..35ced9d 100644 >> --- a/include/crypto/gf128mul.h >> +++ b/include/crypto/gf128mul.h >> @@ -49,6 +49,7 @@ >> #ifndef _CRYPTO_GF128MUL_H >> #define _CRYPTO_GF128MUL_H >> >> +#include >> #include >> #include >> >> @@ -163,8 +164,58 @@ void gf128mul_lle(be128 *a, const be128 *b); >> >> void gf128mul_bbe(be128 *a, const be128 *b); >> >> -/* multiply by x in ble format, needed by XTS */ >> -void gf128mul_x_ble(be128 *a, const be128 *b); >> +/* >> + * The following functions multiply a field element by x in >> + * the polynomial field representation. They use 64-bit word operations >> + * to gain speed but compensate for machine endianness and hence work >> + * correctly on both styles of machine. >> + * >> + * They are defined here for performance. >> + */ >> + >> +static inline u64 gf128mul_mask_from_bit(u64 x, int which) >> +{ >> + /* a constant-time version of 'x & ((u64)1 << which) ? 
(u64)-1 : 0' */ >> + return ((s64)(x << (63 - which)) >> 63); >> +} >> + >> +static inline void gf128mul_x_lle(be128 *r, const be128 *x) >> +{ >> + u64 a = be64_to_cpu(x->a); >> + u64 b = be64_to_cpu(x->b); >> + >> + /* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48 >> + * (see crypto/gf128mul.c): */ >> + u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56); >> + >> + r->b = cpu_to_be64((b >> 1) | (a << 63)); >> + r->a = cpu_to_be64((a >> 1) ^ _tt); >> +} >> + >> +static inline void gf128mul_x_bbe(be128 *r, const be128 *x) >> +{ >> + u64 a = be64_to_cpu(x->a); >> + u64 b = be64_to_cpu(x->b); >> + >> + /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */ >> + u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87; >> + >> + r->a = cpu_to_be64((a << 1) | (b >> 63)); >> + r->b = cpu_to_be64((b << 1) ^ _tt); >> +} >> + >> +/* needed by XTS */ >> +static inline void gf128mul_x_ble(be128 *r, const be128 *x) >> +{ >> + u64 a = le64_to_cpu(x->a); >> + u64 b = le64_to_cpu(x->b); >> + >> + /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */ >> + u64 _tt = gf128mul_mask_from_bit(b, 63) & 0x87; >> + >> + r->a = cpu_to_le64((a << 1) ^ _tt); >> + r->b = cpu_to_le64((b << 1) | (a >> 63)); >> +} >> >> /* 4k table optimization */ >> >> -- >> 2.9.3 >>