2024-02-04 08:52:27

by John Sanpe

Subject: [PATCH] lib/bch.c: increase bitrev single conversion length

Optimize the performance of the three functions (load_ecc8, store_ecc8
and bch_encode) by using a larger conversion length.
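For illustration, here is a minimal sketch (not part of the patch) of what
the change amounts to for one 32-bit ECC word in the bch->swap_bits case;
the helper names below are made up for this example, while bitrev8(),
bitrev32() and get_unaligned_le32() are the kernel helpers the patch uses:

/*
 * Illustrative sketch only: the change replaces four bitrev8() byte
 * conversions per 32-bit ECC word with one unaligned load plus a
 * single bitrev32().
 */
#include <linux/bitrev.h>
#include <asm/unaligned.h>

/* byte-at-a-time: reverse each byte, then assemble big-endian */
static u32 load_word_bytewise(const u8 *src)
{
        return ((u32)bitrev8(src[0]) << 24) |
               ((u32)bitrev8(src[1]) << 16) |
               ((u32)bitrev8(src[2]) <<  8) |
                (u32)bitrev8(src[3]);
}

/* word-at-a-time: load little-endian, reverse all 32 bits at once */
static u32 load_word_wordwise(const u8 *src)
{
        return bitrev32(get_unaligned_le32(src));
}

Bit-reversing the little-endian load of four bytes yields the same word as
reversing each byte and packing them big-endian, which is why a single
bitrev32() can stand in for four bitrev8() calls.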

Signed-off-by: John Sanpe <[email protected]>
---
lib/bch.c | 87 +++++++++++++++++++++++++++++++------------------------
1 file changed, 49 insertions(+), 38 deletions(-)

diff --git a/lib/bch.c b/lib/bch.c
index 5f71fd76eca8..e9a9517e7b1b 100644
--- a/lib/bch.c
+++ b/lib/bch.c
@@ -73,6 +73,7 @@
#include <linux/bitops.h>
#include <linux/bitrev.h>
#include <asm/byteorder.h>
+#include <asm/unaligned.h>
#include <linux/bch.h>

#if defined(CONFIG_BCH_CONST_PARAMS)
@@ -115,14 +116,6 @@ struct gf_poly_deg1 {
unsigned int c[2];
};

-static u8 swap_bits(struct bch_control *bch, u8 in)
-{
- if (!bch->swap_bits)
- return in;
-
- return bitrev8(in);
-}
-
/*
* same as bch_encode(), but process input data one byte at a time
*/
@@ -135,7 +128,10 @@ static void bch_encode_unaligned(struct bch_control *bch,
const int l = BCH_ECC_WORDS(bch)-1;

while (len--) {
- u8 tmp = swap_bits(bch, *data++);
+ u8 tmp = *data++;
+
+ if (bch->swap_bits)
+ tmp = bitrev8(tmp);

p = bch->mod8_tab + (l+1)*(((ecc[0] >> 24)^(tmp)) & 0xff);

@@ -152,20 +148,27 @@ static void bch_encode_unaligned(struct bch_control *bch,
static void load_ecc8(struct bch_control *bch, uint32_t *dst,
const uint8_t *src)
{
- uint8_t pad[4] = {0, 0, 0, 0};
unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
+ uint32_t value, bytes = 0;
+
+ for (i = 0; i < nwords; i++, src += 4) {
+ if (!bch->swap_bits)
+ value = get_unaligned_be32(src);
+ else {
+ value = get_unaligned_le32(src);
+ value = bitrev32(value);
+ }
+ dst[i] = value;
+ }

- for (i = 0; i < nwords; i++, src += 4)
- dst[i] = ((u32)swap_bits(bch, src[0]) << 24) |
- ((u32)swap_bits(bch, src[1]) << 16) |
- ((u32)swap_bits(bch, src[2]) << 8) |
- swap_bits(bch, src[3]);
-
- memcpy(pad, src, BCH_ECC_BYTES(bch)-4*nwords);
- dst[nwords] = ((u32)swap_bits(bch, pad[0]) << 24) |
- ((u32)swap_bits(bch, pad[1]) << 16) |
- ((u32)swap_bits(bch, pad[2]) << 8) |
- swap_bits(bch, pad[3]);
+ memcpy(&bytes, src, BCH_ECC_BYTES(bch)-4*nwords);
+ if (!bch->swap_bits)
+ value = be32_to_cpup((__be32 *)&bytes);
+ else {
+ value = le32_to_cpup((__le32 *)&bytes);
+ value = bitrev32(value);
+ }
+ dst[nwords] = value;
}

/*
@@ -174,20 +177,27 @@ static void load_ecc8(struct bch_control *bch, uint32_t *dst,
static void store_ecc8(struct bch_control *bch, uint8_t *dst,
const uint32_t *src)
{
- uint8_t pad[4];
unsigned int i, nwords = BCH_ECC_WORDS(bch)-1;
+ uint32_t value, bytes;
+
+ for (i = 0; i < nwords; i++, dst += 4) {
+ value = src[i];
+ if (!bch->swap_bits)
+ put_unaligned_be32(value, dst);
+ else {
+ value = bitrev32(value);
+ put_unaligned_le32(value, dst);
+ }
+ }

- for (i = 0; i < nwords; i++) {
- *dst++ = swap_bits(bch, src[i] >> 24);
- *dst++ = swap_bits(bch, src[i] >> 16);
- *dst++ = swap_bits(bch, src[i] >> 8);
- *dst++ = swap_bits(bch, src[i]);
+ value = src[nwords];
+ if (!bch->swap_bits)
+ *(__be32 *)&bytes = cpu_to_be32(value);
+ else {
+ value = bitrev32(value);
+ *(__le32 *)&bytes = cpu_to_le32(value);
}
- pad[0] = swap_bits(bch, src[nwords] >> 24);
- pad[1] = swap_bits(bch, src[nwords] >> 16);
- pad[2] = swap_bits(bch, src[nwords] >> 8);
- pad[3] = swap_bits(bch, src[nwords]);
- memcpy(dst, pad, BCH_ECC_BYTES(bch)-4*nwords);
+ memcpy(dst, &bytes, BCH_ECC_BYTES(bch)-4*nwords);
}

/**
@@ -257,12 +267,13 @@ void bch_encode(struct bch_control *bch, const uint8_t *data,
*/
while (mlen--) {
/* input data is read in big-endian format */
- w = cpu_to_be32(*pdata++);
- if (bch->swap_bits)
- w = (u32)swap_bits(bch, w) |
- ((u32)swap_bits(bch, w >> 8) << 8) |
- ((u32)swap_bits(bch, w >> 16) << 16) |
- ((u32)swap_bits(bch, w >> 24) << 24);
+ if (!bch->swap_bits)
+ w = be32_to_cpup((__be32 *)pdata++);
+ else {
+ w = le32_to_cpup((__le32 *)pdata++);
+ w = bitrev32(w);
+ }
+
w ^= r[0];
p0 = tab0 + (l+1)*((w >> 0) & 0xff);
p1 = tab1 + (l+1)*((w >> 8) & 0xff);
--
2.43.0



2024-02-04 09:09:45

by Linus Torvalds

Subject: Re: [PATCH] lib/bch.c: increase bitrev single conversion length

On Sun, 4 Feb 2024 at 08:52, John Sanpe <[email protected]> wrote:
>
> Optimize the performance of the three functions (load_ecc8, store_ecc8
> and bch_encode) by using a larger conversion length.

Honestly, with any optimization, you should quote performance numbers.

Also, it's questionable how meaningful this is, considering that most
architectures do the bit swap with a byte lookup, and the 32-bit bit
swap is just four byte lookups. For all we know, this only makes
things worse.
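
(For reference, the generic fallback in include/linux/bitrev.h composes
wider reversals out of lookups in the 256-entry byte_rev_table from
lib/bitrev.c, roughly as sketched below; the example_ prefix marks this
as an illustration, not the exact kernel source:)

extern const u8 byte_rev_table[256];    /* 256-entry table in lib/bitrev.c */

static inline u8 example_bitrev8(u8 byte)
{
        return byte_rev_table[byte];
}

static inline u16 example_bitrev16(u16 x)
{
        return (example_bitrev8(x & 0xff) << 8) | example_bitrev8(x >> 8);
}

static inline u32 example_bitrev32(u32 x)
{
        return ((u32)example_bitrev16(x & 0xffff) << 16) |
                example_bitrev16(x >> 16);
}

So without an arch-specific bit-reverse instruction, one bitrev32() still
costs four table lookups, the same as reversing four bytes individually.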

Finally, if you actually want to do things in bigger chunks, that
->swap_bits conditional should probably be moved out of the loops, not
just have that questionable 8->32 bit expansion.
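
(One way to read that suggestion, sketched here for load_ecc8() only;
load_ecc8_split is a hypothetical name and the trailing partial word is
left out:)

/* hypothetical restructuring: branch on swap_bits once, not per word */
static void load_ecc8_split(struct bch_control *bch, uint32_t *dst,
                            const uint8_t *src)
{
        unsigned int i, nwords = BCH_ECC_WORDS(bch) - 1;

        if (!bch->swap_bits) {
                for (i = 0; i < nwords; i++, src += 4)
                        dst[i] = get_unaligned_be32(src);
        } else {
                for (i = 0; i < nwords; i++, src += 4)
                        dst[i] = bitrev32(get_unaligned_le32(src));
        }
        /* trailing BCH_ECC_BYTES(bch) - 4*nwords bytes handled as before */
}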

Linus