2023-04-11 15:07:45

by Nathan Huckleberry

[permalink] [raw]
Subject: Re: [PATCH v4 4/4] RISC-V: crypto: add accelerated GCM GHASH implementation

Hey Heiko,

Thanks for the patch, it generally looks good. A few comments.

On Wed, Mar 29, 2023 at 7:08 AM Heiko Stuebner <[email protected]> wrote:
>
> From: Heiko Stuebner <[email protected]>
>
> With different sets of available extensions a number of different
> implementation variants are possible. Quite a number of them are already
> implemented in openSSL or are in the process of being implemented, so pick
> the relevant openSSL coden and add suitable glue code similar to arm64 and
> powerpc to use it for kernel-specific cryptography.
>
> The prioritization of the algorithms follows the ifdef chain for the
> assembly callbacks done in openssl but here algorithms will get registered
> separately so that all of them can be part of the crypto selftests.
>
> The crypto subsystem will select the most performant of all registered
> algorithms on the running system but will selftest all registered ones.
>
> In a first step this adds scalar variants using the Zbc, Zbb and
> possible Zbkb (bitmanip crypto extension) and the perl implementation
> stems from openSSL pull request on
> https://github.com/openssl/openssl/pull/20078
>
> Co-developed-by: Christoph Müllner <[email protected]>
> Signed-off-by: Christoph Müllner <[email protected]>
> Signed-off-by: Heiko Stuebner <[email protected]>
> ---
> arch/riscv/crypto/Kconfig | 13 +
> arch/riscv/crypto/Makefile | 14 +
> arch/riscv/crypto/ghash-riscv64-glue.c | 258 ++++++++++++++++
> arch/riscv/crypto/ghash-riscv64-zbc.pl | 400 +++++++++++++++++++++++++
> arch/riscv/crypto/riscv.pm | 231 ++++++++++++++
> 5 files changed, 916 insertions(+)
> create mode 100644 arch/riscv/crypto/ghash-riscv64-glue.c
> create mode 100644 arch/riscv/crypto/ghash-riscv64-zbc.pl
> create mode 100644 arch/riscv/crypto/riscv.pm
>
> diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
> index 10d60edc0110..cd2237923e68 100644
> --- a/arch/riscv/crypto/Kconfig
> +++ b/arch/riscv/crypto/Kconfig
> @@ -2,4 +2,17 @@
>
> menu "Accelerated Cryptographic Algorithms for CPU (riscv)"
>
> +config CRYPTO_GHASH_RISCV64
> + tristate "Hash functions: GHASH"
> + depends on 64BIT && RISCV_ISA_ZBC
> + select CRYPTO_HASH
> + select CRYPTO_LIB_GF128MUL
> + help
> + GCM GHASH function (NIST SP800-38D)
> +
> + Architecture: riscv64 using one of:
> + - Zbc extension
> + - Zbc + Zbb extensions
> + - Zbc + Zbkb extensions
> +
> endmenu
> diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
> index b3b6332c9f6d..0a158919e9da 100644
> --- a/arch/riscv/crypto/Makefile
> +++ b/arch/riscv/crypto/Makefile
> @@ -2,3 +2,17 @@
> #
> # linux/arch/riscv/crypto/Makefile
> #
> +
> +obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
> +ghash-riscv64-y := ghash-riscv64-glue.o
> +ifdef CONFIG_RISCV_ISA_ZBC
> +ghash-riscv64-y += ghash-riscv64-zbc.o
> +endif
> +
> +quiet_cmd_perlasm = PERLASM $@
> + cmd_perlasm = $(PERL) $(<) void $(@)
> +
> +$(obj)/ghash-riscv64-zbc.S: $(src)/ghash-riscv64-zbc.pl
> + $(call cmd,perlasm)
> +
> +clean-files += ghash-riscv64-zbc.S
> diff --git a/arch/riscv/crypto/ghash-riscv64-glue.c b/arch/riscv/crypto/ghash-riscv64-glue.c
> new file mode 100644
> index 000000000000..5ab704c49539
> --- /dev/null
> +++ b/arch/riscv/crypto/ghash-riscv64-glue.c
> @@ -0,0 +1,258 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * RISC-V optimized GHASH routines
> + *
> + * Copyright (C) 2023 VRULL GmbH
> + * Author: Heiko Stuebner <[email protected]>
> + */
> +
> +#include <linux/types.h>
> +#include <linux/err.h>
> +#include <linux/crypto.h>
> +#include <linux/module.h>
> +#include <asm/simd.h>
> +#include <crypto/ghash.h>
> +#include <crypto/internal/hash.h>
> +#include <crypto/internal/simd.h>
> +
> +/* Zbc (optional with zbkb improvements) */
> +void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
> + const u8 *inp, size_t len);
> +void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
> + const u8 *inp, size_t len);
> +
> +struct riscv64_ghash_ctx {
> + void (*ghash_func)(u64 Xi[2], const u128 Htable[16],
> + const u8 *inp, size_t len);
> +
> + /* key used by vector asm */
> + u128 htable[16];

This field looks too big. The assembly only loads the first 128-byte
value from this table.

Is this copied from another implementation? There's an optimization
where you precompute the first N powers of H so that you can perform 1
finite field reduction for every N multiplications, but it doesn't
look like that's being used here.

> + /* key used by software fallback */
> + be128 key;
> +};
> +
> +struct riscv64_ghash_desc_ctx {
> + u64 shash[2];
> + u8 buffer[GHASH_DIGEST_SIZE];
> + int bytes;
> +};
> +
> +static int riscv64_ghash_init(struct shash_desc *desc)
> +{
> + struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +
> + dctx->bytes = 0;
> + memset(dctx->shash, 0, GHASH_DIGEST_SIZE);
> + return 0;
> +}
> +
> +#ifdef CONFIG_RISCV_ISA_ZBC
> +
> +#define RISCV64_ZBC_SETKEY(VARIANT, GHASH) \
> +void gcm_init_rv64i_ ## VARIANT(u128 Htable[16], const u64 Xi[2]); \
> +static int riscv64_zbc_ghash_setkey_ ## VARIANT(struct crypto_shash *tfm, \
> + const u8 *key, \
> + unsigned int keylen) \
> +{ \
> + struct riscv64_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(tfm)); \
> + const u64 k[2] = { cpu_to_be64(((const u64 *)key)[0]), \
> + cpu_to_be64(((const u64 *)key)[1]) }; \
> + \
> + if (keylen != GHASH_BLOCK_SIZE) \
> + return -EINVAL; \
> + \
> + memcpy(&ctx->key, key, GHASH_BLOCK_SIZE); \
> + gcm_init_rv64i_ ## VARIANT(ctx->htable, k); \
> + \
> + ctx->ghash_func = gcm_ghash_rv64i_ ## GHASH; \
> + \
> + return 0; \
> +}

I'd prefer three identical functions over a macro here. Code searching
tools and compiler warnings are significantly worse with macros.

> +
> +static int riscv64_zbc_ghash_update(struct shash_desc *desc,
> + const u8 *src, unsigned int srclen)
> +{
> + unsigned int len;
> + struct riscv64_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
> + struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +
> + if (dctx->bytes) {
> + if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
> + memcpy(dctx->buffer + dctx->bytes, src,
> + srclen);
> + dctx->bytes += srclen;
> + return 0;
> + }
> + memcpy(dctx->buffer + dctx->bytes, src,
> + GHASH_DIGEST_SIZE - dctx->bytes);
> +
> + ctx->ghash_func(dctx->shash, ctx->htable,
> + dctx->buffer, GHASH_DIGEST_SIZE);
> +
> + src += GHASH_DIGEST_SIZE - dctx->bytes;
> + srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
> + dctx->bytes = 0;
> + }
> + len = srclen & ~(GHASH_DIGEST_SIZE - 1);
> +
> + if (len) {
> + gcm_ghash_rv64i_zbc(dctx->shash, ctx->htable,
> + src, len);
> + src += len;
> + srclen -= len;
> + }
> +
> + if (srclen) {
> + memcpy(dctx->buffer, src, srclen);
> + dctx->bytes = srclen;
> + }
> + return 0;
> +}
> +
> +static int riscv64_zbc_ghash_final(struct shash_desc *desc, u8 *out)
> +{
> + int i;
> + struct riscv64_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
> + struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
> +
> + if (dctx->bytes) {
> + for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
> + dctx->buffer[i] = 0;
> + ctx->ghash_func(dctx->shash, ctx->htable,
> + dctx->buffer, GHASH_DIGEST_SIZE);

Can we do this without an indirect call?

> + dctx->bytes = 0;
> + }
> + memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
> + return 0;
> +}
> +
> +RISCV64_ZBC_SETKEY(zbc, zbc);
> +struct shash_alg riscv64_zbc_ghash_alg = {
> + .digestsize = GHASH_DIGEST_SIZE,
> + .init = riscv64_ghash_init,
> + .update = riscv64_zbc_ghash_update,
> + .final = riscv64_zbc_ghash_final,
> + .setkey = riscv64_zbc_ghash_setkey_zbc,
> + .descsize = sizeof(struct riscv64_ghash_desc_ctx)
> + + sizeof(struct ghash_desc_ctx),
> + .base = {
> + .cra_name = "ghash",
> + .cra_driver_name = "riscv64_zbc_ghash",
> + .cra_priority = 250,
> + .cra_blocksize = GHASH_BLOCK_SIZE,
> + .cra_ctxsize = sizeof(struct riscv64_ghash_ctx),
> + .cra_module = THIS_MODULE,
> + },
> +};
> +
> +RISCV64_ZBC_SETKEY(zbc__zbb, zbc);
> +struct shash_alg riscv64_zbc_zbb_ghash_alg = {
> + .digestsize = GHASH_DIGEST_SIZE,
> + .init = riscv64_ghash_init,
> + .update = riscv64_zbc_ghash_update,
> + .final = riscv64_zbc_ghash_final,
> + .setkey = riscv64_zbc_ghash_setkey_zbc__zbb,
> + .descsize = sizeof(struct riscv64_ghash_desc_ctx)
> + + sizeof(struct ghash_desc_ctx),
> + .base = {
> + .cra_name = "ghash",
> + .cra_driver_name = "riscv64_zbc_zbb_ghash",
> + .cra_priority = 251,
> + .cra_blocksize = GHASH_BLOCK_SIZE,
> + .cra_ctxsize = sizeof(struct riscv64_ghash_ctx),
> + .cra_module = THIS_MODULE,
> + },
> +};
> +
> +RISCV64_ZBC_SETKEY(zbc__zbkb, zbc__zbkb);
> +struct shash_alg riscv64_zbc_zbkb_ghash_alg = {
> + .digestsize = GHASH_DIGEST_SIZE,
> + .init = riscv64_ghash_init,
> + .update = riscv64_zbc_ghash_update,
> + .final = riscv64_zbc_ghash_final,
> + .setkey = riscv64_zbc_ghash_setkey_zbc__zbkb,
> + .descsize = sizeof(struct riscv64_ghash_desc_ctx)
> + + sizeof(struct ghash_desc_ctx),
> + .base = {
> + .cra_name = "ghash",
> + .cra_driver_name = "riscv64_zbc_zbkb_ghash",
> + .cra_priority = 252,
> + .cra_blocksize = GHASH_BLOCK_SIZE,
> + .cra_ctxsize = sizeof(struct riscv64_ghash_ctx),
> + .cra_module = THIS_MODULE,
> + },
> +};
> +
> +#endif /* CONFIG_RISCV_ISA_ZBC */
> +
> +#define RISCV64_DEFINED_GHASHES 7
> +
> +static struct shash_alg *riscv64_ghashes[RISCV64_DEFINED_GHASHES];
> +static int num_riscv64_ghashes;
> +
> +static int __init riscv64_ghash_register(struct shash_alg *ghash)
> +{
> + int ret;
> +
> + ret = crypto_register_shash(ghash);
> + if (ret < 0) {
> + int i;
> +
> + for (i = num_riscv64_ghashes - 1; i >= 0 ; i--)
> + crypto_unregister_shash(riscv64_ghashes[i]);
> +
> + num_riscv64_ghashes = 0;
> +
> + return ret;
> + }
> +
> + pr_debug("Registered RISC-V ghash %s\n", ghash->base.cra_driver_name);
> + riscv64_ghashes[num_riscv64_ghashes] = ghash;
> + num_riscv64_ghashes++;
> + return 0;
> +}
> +
> +static int __init riscv64_ghash_mod_init(void)
> +{
> + int ret = 0;
> +
> +#ifdef CONFIG_RISCV_ISA_ZBC
> + if (riscv_isa_extension_available(NULL, ZBC)) {
> + ret = riscv64_ghash_register(&riscv64_zbc_ghash_alg);
> + if (ret < 0)
> + return ret;
> +
> + if (riscv_isa_extension_available(NULL, ZBB)) {
> + ret = riscv64_ghash_register(&riscv64_zbc_zbb_ghash_alg);
> + if (ret < 0)
> + return ret;
> + }
> +
> + if (riscv_isa_extension_available(NULL, ZBKB)) {
> + ret = riscv64_ghash_register(&riscv64_zbc_zbkb_ghash_alg);
> + if (ret < 0)
> + return ret;
> + }
> + }
> +#endif
> +
> + return 0;
> +}
> +
> +static void __exit riscv64_ghash_mod_fini(void)
> +{
> + int i;
> +
> + for (i = num_riscv64_ghashes - 1; i >= 0 ; i--)
> + crypto_unregister_shash(riscv64_ghashes[i]);
> +
> + num_riscv64_ghashes = 0;
> +}
> +
> +module_init(riscv64_ghash_mod_init);
> +module_exit(riscv64_ghash_mod_fini);
> +
> +MODULE_DESCRIPTION("GSM GHASH (accelerated)");
> +MODULE_AUTHOR("Heiko Stuebner <[email protected]>");
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS_CRYPTO("ghash");
> diff --git a/arch/riscv/crypto/ghash-riscv64-zbc.pl b/arch/riscv/crypto/ghash-riscv64-zbc.pl
> new file mode 100644
> index 000000000000..691231ffa11c
> --- /dev/null
> +++ b/arch/riscv/crypto/ghash-riscv64-zbc.pl
> @@ -0,0 +1,400 @@
> +#! /usr/bin/env perl
> +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
> +#
> +# Licensed under the Apache License 2.0 (the "License"). You may not use
> +# this file except in compliance with the License. You can obtain a copy
> +# in the file LICENSE in the source distribution or at
> +# https://www.openssl.org/source/license.html
> +
> +use strict;
> +use warnings;
> +
> +use FindBin qw($Bin);
> +use lib "$Bin";
> +use lib "$Bin/../../perlasm";
> +use riscv;
> +
> +# $output is the last argument if it looks like a file (it has an extension)
> +# $flavour is the first argument if it doesn't look like a file
> +my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
> +my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
> +
> +$output and open STDOUT,">$output";
> +
> +my $code=<<___;
> +.text
> +___
> +
> +################################################################################
> +# void gcm_init_rv64i_zbc(u128 Htable[16], const u64 H[2]);
> +# void gcm_init_rv64i_zbc__zbb(u128 Htable[16], const u64 H[2]);
> +# void gcm_init_rv64i_zbc__zbkb(u128 Htable[16], const u64 H[2]);
> +#
> +# input: H: 128-bit H - secret parameter E(K, 0^128)
> +# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zbc* and
> +# gcm_ghash_rv64i_zbc*
> +#
> +# All callers of this function revert the byte-order unconditionally
> +# on little-endian machines. So we need to revert the byte-order back.
> +# Additionally we reverse the bits of each byte.
> +
> +{
> +my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_init_rv64i_zbc
> +.type gcm_init_rv64i_zbc,\@function
> +gcm_init_rv64i_zbc:
> + ld $VAL0,0($H)
> + ld $VAL1,8($H)
> + @{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
> + @{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
> + @{[sd_rev8_rv64i $VAL0, $Htable, 0, $TMP0]}
> + @{[sd_rev8_rv64i $VAL1, $Htable, 8, $TMP0]}
> + ret
> +.size gcm_init_rv64i_zbc,.-gcm_init_rv64i_zbc
> +___
> +}
> +
> +{
> +my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_init_rv64i_zbc__zbb
> +.type gcm_init_rv64i_zbc__zbb,\@function
> +gcm_init_rv64i_zbc__zbb:
> + ld $VAL0,0($H)
> + ld $VAL1,8($H)
> + @{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
> + @{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
> + @{[rev8 $VAL0, $VAL0]}
> + @{[rev8 $VAL1, $VAL1]}
> + sd $VAL0,0($Htable)
> + sd $VAL1,8($Htable)
> + ret
> +.size gcm_init_rv64i_zbc__zbb,.-gcm_init_rv64i_zbc__zbb
> +___
> +}
> +
> +{
> +my ($Htable,$H,$TMP0,$TMP1) = ("a0","a1","t0","t1");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_init_rv64i_zbc__zbkb
> +.type gcm_init_rv64i_zbc__zbkb,\@function
> +gcm_init_rv64i_zbc__zbkb:
> + ld $TMP0,0($H)
> + ld $TMP1,8($H)
> + @{[brev8 $TMP0, $TMP0]}
> + @{[brev8 $TMP1, $TMP1]}
> + @{[rev8 $TMP0, $TMP0]}
> + @{[rev8 $TMP1, $TMP1]}
> + sd $TMP0,0($Htable)
> + sd $TMP1,8($Htable)
> + ret
> +.size gcm_init_rv64i_zbc__zbkb,.-gcm_init_rv64i_zbc__zbkb
> +___
> +}
> +
> +################################################################################
> +# void gcm_gmult_rv64i_zbc(u64 Xi[2], const u128 Htable[16]);
> +# void gcm_gmult_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16]);
> +#
> +# input: Xi: current hash value
> +# Htable: copy of H
> +# output: Xi: next hash value Xi
> +#
> +# Compute GMULT (Xi*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
> +# extensions. Using the no-Karatsuba approach and clmul for the final reduction.
> +# This results in an implementation with minimized number of instructions.
> +# HW with clmul latencies higher than 2 cycles might observe a performance
> +# improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
> +# might observe a performance improvement with additionally converting the
> +# reduction to shift&xor. For a full discussion of this estimates see
> +# https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
> +{
> +my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
> +my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_gmult_rv64i_zbc
> +.type gcm_gmult_rv64i_zbc,\@function
> +gcm_gmult_rv64i_zbc:
> + # Load Xi and bit-reverse it
> + ld $x0, 0($Xi)
> + ld $x1, 8($Xi)
> + @{[brev8_rv64i $x0, $z0, $z1, $z2]}
> + @{[brev8_rv64i $x1, $z0, $z1, $z2]}
> +
> + # Load the key (already bit-reversed)
> + ld $y0, 0($Htable)
> + ld $y1, 8($Htable)
> +
> + # Load the reduction constant
> + la $polymod, Lpolymod
> + lbu $polymod, 0($polymod)
> +
> + # Multiplication (without Karatsuba)
> + @{[clmulh $z3, $x1, $y1]}
> + @{[clmul $z2, $x1, $y1]}
> + @{[clmulh $t1, $x0, $y1]}
> + @{[clmul $z1, $x0, $y1]}
> + xor $z2, $z2, $t1
> + @{[clmulh $t1, $x1, $y0]}
> + @{[clmul $t0, $x1, $y0]}
> + xor $z2, $z2, $t1
> + xor $z1, $z1, $t0
> + @{[clmulh $t1, $x0, $y0]}
> + @{[clmul $z0, $x0, $y0]}
> + xor $z1, $z1, $t1
> +
> + # Reduction with clmul
> + @{[clmulh $t1, $z3, $polymod]}
> + @{[clmul $t0, $z3, $polymod]}
> + xor $z2, $z2, $t1
> + xor $z1, $z1, $t0
> + @{[clmulh $t1, $z2, $polymod]}
> + @{[clmul $t0, $z2, $polymod]}
> + xor $x1, $z1, $t1
> + xor $x0, $z0, $t0
> +
> + # Bit-reverse Xi back and store it
> + @{[brev8_rv64i $x0, $z0, $z1, $z2]}
> + @{[brev8_rv64i $x1, $z0, $z1, $z2]}
> + sd $x0, 0($Xi)
> + sd $x1, 8($Xi)
> + ret
> +.size gcm_gmult_rv64i_zbc,.-gcm_gmult_rv64i_zbc
> +___
> +}
> +
> +{
> +my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
> +my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_gmult_rv64i_zbc__zbkb
> +.type gcm_gmult_rv64i_zbc__zbkb,\@function
> +gcm_gmult_rv64i_zbc__zbkb:
> + # Load Xi and bit-reverse it
> + ld $x0, 0($Xi)
> + ld $x1, 8($Xi)
> + @{[brev8 $x0, $x0]}
> + @{[brev8 $x1, $x1]}
> +
> + # Load the key (already bit-reversed)
> + ld $y0, 0($Htable)
> + ld $y1, 8($Htable)
> +
> + # Load the reduction constant
> + la $polymod, Lpolymod
> + lbu $polymod, 0($polymod)
> +
> + # Multiplication (without Karatsuba)
> + @{[clmulh $z3, $x1, $y1]}
> + @{[clmul $z2, $x1, $y1]}
> + @{[clmulh $t1, $x0, $y1]}
> + @{[clmul $z1, $x0, $y1]}
> + xor $z2, $z2, $t1
> + @{[clmulh $t1, $x1, $y0]}
> + @{[clmul $t0, $x1, $y0]}
> + xor $z2, $z2, $t1
> + xor $z1, $z1, $t0
> + @{[clmulh $t1, $x0, $y0]}
> + @{[clmul $z0, $x0, $y0]}
> + xor $z1, $z1, $t1
> +
> + # Reduction with clmul
> + @{[clmulh $t1, $z3, $polymod]}
> + @{[clmul $t0, $z3, $polymod]}
> + xor $z2, $z2, $t1
> + xor $z1, $z1, $t0
> + @{[clmulh $t1, $z2, $polymod]}
> + @{[clmul $t0, $z2, $polymod]}
> + xor $x1, $z1, $t1
> + xor $x0, $z0, $t0
> +
> + # Bit-reverse Xi back and store it
> + @{[brev8 $x0, $x0]}
> + @{[brev8 $x1, $x1]}
> + sd $x0, 0($Xi)
> + sd $x1, 8($Xi)
> + ret
> +.size gcm_gmult_rv64i_zbc__zbkb,.-gcm_gmult_rv64i_zbc__zbkb
> +___
> +}
> +
> +################################################################################
> +# void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
> +# const u8 *inp, size_t len);
> +# void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
> +# const u8 *inp, size_t len);
> +#
> +# input: Xi: current hash value
> +# Htable: copy of H
> +# inp: pointer to input data
> +# len: length of input data in bytes (mutiple of block size)
> +# output: Xi: Xi+1 (next hash value Xi)
> +{
> +my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
> +my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_ghash_rv64i_zbc
> +.type gcm_ghash_rv64i_zbc,\@function
> +gcm_ghash_rv64i_zbc:
> + # Load Xi and bit-reverse it
> + ld $x0, 0($Xi)
> + ld $x1, 8($Xi)
> + @{[brev8_rv64i $x0, $z0, $z1, $z2]}
> + @{[brev8_rv64i $x1, $z0, $z1, $z2]}
> +
> + # Load the key (already bit-reversed)
> + ld $y0, 0($Htable)
> + ld $y1, 8($Htable)
> +
> + # Load the reduction constant
> + la $polymod, Lpolymod
> + lbu $polymod, 0($polymod)
> +
> +Lstep:
> + # Load the input data, bit-reverse them, and XOR them with Xi
> + ld $t0, 0($inp)
> + ld $t1, 8($inp)
> + add $inp, $inp, 16
> + add $len, $len, -16
> + @{[brev8_rv64i $t0, $z0, $z1, $z2]}
> + @{[brev8_rv64i $t1, $z0, $z1, $z2]}
> + xor $x0, $x0, $t0
> + xor $x1, $x1, $t1
> +
> + # Multiplication (without Karatsuba)
> + @{[clmulh $z3, $x1, $y1]}
> + @{[clmul $z2, $x1, $y1]}
> + @{[clmulh $t1, $x0, $y1]}
> + @{[clmul $z1, $x0, $y1]}
> + xor $z2, $z2, $t1
> + @{[clmulh $t1, $x1, $y0]}
> + @{[clmul $t0, $x1, $y0]}
> + xor $z2, $z2, $t1
> + xor $z1, $z1, $t0
> + @{[clmulh $t1, $x0, $y0]}
> + @{[clmul $z0, $x0, $y0]}
> + xor $z1, $z1, $t1
> +
> + # Reduction with clmul
> + @{[clmulh $t1, $z3, $polymod]}
> + @{[clmul $t0, $z3, $polymod]}
> + xor $z2, $z2, $t1
> + xor $z1, $z1, $t0
> + @{[clmulh $t1, $z2, $polymod]}
> + @{[clmul $t0, $z2, $polymod]}
> + xor $x1, $z1, $t1
> + xor $x0, $z0, $t0
> +
> + # Iterate over all blocks
> + bnez $len, Lstep
> +
> + # Bit-reverse final Xi back and store it
> + @{[brev8_rv64i $x0, $z0, $z1, $z2]}
> + @{[brev8_rv64i $x1, $z0, $z1, $z2]}
> + sd $x0, 0($Xi)
> + sd $x1, 8($Xi)
> + ret
> +.size gcm_ghash_rv64i_zbc,.-gcm_ghash_rv64i_zbc
> +___
> +}
> +
> +{
> +my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
> +my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
> +
> +$code .= <<___;
> +.p2align 3
> +.globl gcm_ghash_rv64i_zbc__zbkb
> +.type gcm_ghash_rv64i_zbc__zbkb,\@function
> +gcm_ghash_rv64i_zbc__zbkb:
> + # Load Xi and bit-reverse it
> + ld $x0, 0($Xi)
> + ld $x1, 8($Xi)
> + @{[brev8 $x0, $x0]}
> + @{[brev8 $x1, $x1]}
> +
> + # Load the key (already bit-reversed)
> + ld $y0, 0($Htable)
> + ld $y1, 8($Htable)
> +
> + # Load the reduction constant
> + la $polymod, Lpolymod
> + lbu $polymod, 0($polymod)
> +
> +Lstep_zkbk:
> + # Load the input data, bit-reverse them, and XOR them with Xi
> + ld $t0, 0($inp)
> + ld $t1, 8($inp)
> + add $inp, $inp, 16
> + add $len, $len, -16
> + @{[brev8 $t0, $t0]}
> + @{[brev8 $t1, $t1]}
> + xor $x0, $x0, $t0
> + xor $x1, $x1, $t1
> +
> + # Multiplication (without Karatsuba)
> + @{[clmulh $z3, $x1, $y1]}
> + @{[clmul $z2, $x1, $y1]}
> + @{[clmulh $t1, $x0, $y1]}
> + @{[clmul $z1, $x0, $y1]}
> + xor $z2, $z2, $t1
> + @{[clmulh $t1, $x1, $y0]}
> + @{[clmul $t0, $x1, $y0]}
> + xor $z2, $z2, $t1
> + xor $z1, $z1, $t0
> + @{[clmulh $t1, $x0, $y0]}
> + @{[clmul $z0, $x0, $y0]}
> + xor $z1, $z1, $t1
> +
> + # Reduction with clmul
> + @{[clmulh $t1, $z3, $polymod]}
> + @{[clmul $t0, $z3, $polymod]}
> + xor $z2, $z2, $t1
> + xor $z1, $z1, $t0
> + @{[clmulh $t1, $z2, $polymod]}
> + @{[clmul $t0, $z2, $polymod]}
> + xor $x1, $z1, $t1
> + xor $x0, $z0, $t0
> +
> + # Iterate over all blocks
> + bnez $len, Lstep_zkbk
> +
> + # Bit-reverse final Xi back and store it
> + @{[brev8 $x0, $x0]}
> + @{[brev8 $x1, $x1]}
> + sd $x0, 0($Xi)
> + sd $x1, 8($Xi)
> + ret
> +.size gcm_ghash_rv64i_zbc__zbkb,.-gcm_ghash_rv64i_zbc__zbkb
> +___
> +}
> +
> +$code .= <<___;
> +.p2align 3
> +Lbrev8_const:
> + .dword 0xAAAAAAAAAAAAAAAA
> + .dword 0xCCCCCCCCCCCCCCCC
> + .dword 0xF0F0F0F0F0F0F0F0
> +.size Lbrev8_const,.-Lbrev8_const
> +
> +Lpolymod:
> + .byte 0x87
> +.size Lpolymod,.-Lpolymod
> +___
> +
> +print $code;
> +
> +close STDOUT or die "error closing STDOUT: $!";
> diff --git a/arch/riscv/crypto/riscv.pm b/arch/riscv/crypto/riscv.pm
> new file mode 100644
> index 000000000000..b0c786a13ca0
> --- /dev/null
> +++ b/arch/riscv/crypto/riscv.pm
> @@ -0,0 +1,231 @@
> +#! /usr/bin/env perl
> +# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
> +#
> +# Licensed under the Apache License 2.0 (the "License"). You may not use
> +# this file except in compliance with the License. You can obtain a copy
> +# in the file LICENSE in the source distribution or at
> +# https://www.openssl.org/source/license.html
> +
> +use strict;
> +use warnings;
> +
> +# Set $have_stacktrace to 1 if we have Devel::StackTrace
> +my $have_stacktrace = 0;
> +if (eval {require Devel::StackTrace;1;}) {
> + $have_stacktrace = 1;
> +}
> +
> +my @regs = map("x$_",(0..31));
> +# Mapping from the RISC-V psABI ABI mnemonic names to the register number.
> +my @regaliases = ('zero','ra','sp','gp','tp','t0','t1','t2','s0','s1',
> + map("a$_",(0..7)),
> + map("s$_",(2..11)),
> + map("t$_",(3..6))
> +);
> +
> +my %reglookup;
> +@reglookup{@regs} = @regs;
> +@reglookup{@regaliases} = @regs;
> +
> +# Takes a register name, possibly an alias, and converts it to a register index
> +# from 0 to 31
> +sub read_reg {
> + my $reg = lc shift;
> + if (!exists($reglookup{$reg})) {
> + my $trace = "";
> + if ($have_stacktrace) {
> + $trace = Devel::StackTrace->new->as_string;
> + }
> + die("Unknown register ".$reg."\n".$trace);
> + }
> + my $regstr = $reglookup{$reg};
> + if (!($regstr =~ /^x([0-9]+)$/)) {
> + my $trace = "";
> + if ($have_stacktrace) {
> + $trace = Devel::StackTrace->new->as_string;
> + }
> + die("Could not process register ".$reg."\n".$trace);
> + }
> + return $1;
> +}
> +
> +# Helper functions
> +
> +sub brev8_rv64i {
> + # brev8 without `brev8` instruction (only in Zbkb)
> + # Bit-reverses the first argument and needs two scratch registers
> + my $val = shift;
> + my $t0 = shift;
> + my $t1 = shift;
> + my $brev8_const = shift;
> + my $seq = <<___;
> + la $brev8_const, Lbrev8_const
> +
> + ld $t0, 0($brev8_const) # 0xAAAAAAAAAAAAAAAA
> + slli $t1, $val, 1
> + and $t1, $t1, $t0
> + and $val, $val, $t0
> + srli $val, $val, 1
> + or $val, $t1, $val
> +
> + ld $t0, 8($brev8_const) # 0xCCCCCCCCCCCCCCCC
> + slli $t1, $val, 2
> + and $t1, $t1, $t0
> + and $val, $val, $t0
> + srli $val, $val, 2
> + or $val, $t1, $val
> +
> + ld $t0, 16($brev8_const) # 0xF0F0F0F0F0F0F0F0
> + slli $t1, $val, 4
> + and $t1, $t1, $t0
> + and $val, $val, $t0
> + srli $val, $val, 4
> + or $val, $t1, $val
> +___
> + return $seq;
> +}
> +
> +sub sd_rev8_rv64i {
> + # rev8 without `rev8` instruction (only in Zbb or Zbkb)
> + # Stores the given value byte-reversed and needs one scratch register
> + my $val = shift;
> + my $addr = shift;
> + my $off = shift;
> + my $tmp = shift;
> + my $off0 = ($off + 0);
> + my $off1 = ($off + 1);
> + my $off2 = ($off + 2);
> + my $off3 = ($off + 3);
> + my $off4 = ($off + 4);
> + my $off5 = ($off + 5);
> + my $off6 = ($off + 6);
> + my $off7 = ($off + 7);
> + my $seq = <<___;
> + sb $val, $off7($addr)
> + srli $tmp, $val, 8
> + sb $tmp, $off6($addr)
> + srli $tmp, $val, 16
> + sb $tmp, $off5($addr)
> + srli $tmp, $val, 24
> + sb $tmp, $off4($addr)
> + srli $tmp, $val, 32
> + sb $tmp, $off3($addr)
> + srli $tmp, $val, 40
> + sb $tmp, $off2($addr)
> + srli $tmp, $val, 48
> + sb $tmp, $off1($addr)
> + srli $tmp, $val, 56
> + sb $tmp, $off0($addr)
> +___
> + return $seq;
> +}
> +
> +# Scalar crypto instructions
> +
> +sub aes64ds {
> + # Encoding for aes64ds rd, rs1, rs2 instruction on RV64
> + # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> + my $template = 0b0011101_00000_00000_000_00000_0110011;
> + my $rd = read_reg shift;
> + my $rs1 = read_reg shift;
> + my $rs2 = read_reg shift;
> + return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64dsm {
> + # Encoding for aes64dsm rd, rs1, rs2 instruction on RV64
> + # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> + my $template = 0b0011111_00000_00000_000_00000_0110011;
> + my $rd = read_reg shift;
> + my $rs1 = read_reg shift;
> + my $rs2 = read_reg shift;
> + return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64es {
> + # Encoding for aes64es rd, rs1, rs2 instruction on RV64
> + # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> + my $template = 0b0011001_00000_00000_000_00000_0110011;
> + my $rd = read_reg shift;
> + my $rs1 = read_reg shift;
> + my $rs2 = read_reg shift;
> + return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64esm {
> + # Encoding for aes64esm rd, rs1, rs2 instruction on RV64
> + # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> + my $template = 0b0011011_00000_00000_000_00000_0110011;
> + my $rd = read_reg shift;
> + my $rs1 = read_reg shift;
> + my $rs2 = read_reg shift;
> + return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64im {
> + # Encoding for aes64im rd, rs1 instruction on RV64
> + # XXXXXXXXXXXX_ rs1 _XXX_ rd _XXXXXXX
> + my $template = 0b001100000000_00000_001_00000_0010011;
> + my $rd = read_reg shift;
> + my $rs1 = read_reg shift;
> + return ".word ".($template | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64ks1i {
> + # Encoding for aes64ks1i rd, rs1, rnum instruction on RV64
> + # XXXXXXXX_rnum_ rs1 _XXX_ rd _XXXXXXX
> + my $template = 0b00110001_0000_00000_001_00000_0010011;
> + my $rd = read_reg shift;
> + my $rs1 = read_reg shift;
> + my $rnum = shift;
> + return ".word ".($template | ($rnum << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub aes64ks2 {
> + # Encoding for aes64ks2 rd, rs1, rs2 instruction on RV64
> + # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> + my $template = 0b0111111_00000_00000_000_00000_0110011;
> + my $rd = read_reg shift;
> + my $rs1 = read_reg shift;
> + my $rs2 = read_reg shift;
> + return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub brev8 {
> + # brev8 rd, rs
> + my $template = 0b011010000111_00000_101_00000_0010011;
> + my $rd = read_reg shift;
> + my $rs = read_reg shift;
> + return ".word ".($template | ($rs << 15) | ($rd << 7));
> +}
> +
> +sub clmul {
> + # Encoding for clmul rd, rs1, rs2 instruction on RV64
> + # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> + my $template = 0b0000101_00000_00000_001_00000_0110011;
> + my $rd = read_reg shift;
> + my $rs1 = read_reg shift;
> + my $rs2 = read_reg shift;
> + return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub clmulh {
> + # Encoding for clmulh rd, rs1, rs2 instruction on RV64
> + # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
> + my $template = 0b0000101_00000_00000_011_00000_0110011;
> + my $rd = read_reg shift;
> + my $rs1 = read_reg shift;
> + my $rs2 = read_reg shift;
> + return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
> +}
> +
> +sub rev8 {
> + # Encoding for rev8 rd, rs instruction on RV64
> + # XXXXXXXXXXXXX_ rs _XXX_ rd _XXXXXXX
> + my $template = 0b011010111000_00000_101_00000_0010011;
> + my $rd = read_reg shift;
> + my $rs = read_reg shift;
> + return ".word ".($template | ($rs << 15) | ($rd << 7));
> +}
> +
> +1;
> --
> 2.39.0
>

Thanks,
Huck