2020-01-27 16:58:32

by Andrey Smirnov

[permalink] [raw]
Subject: [PATCH v7 5/9] crypto: caam - simplify RNG implementation

Rework CAAM RNG implementation as follows:

- Make use of the fact that HWRNG supports partial reads and will
handle such cases gracefully by removing recursion in caam_read()

- Convert blocking caam_read() codepath to do a single blocking job
read directly into requested buffer, bypassing any intermediary
buffers

- Convert async caam_read() codepath into a simple single
reader/single writer FIFO use-case, thus simplifying concurrency
handling and delegating buffer read/write position management to KFIFO
subsystem.

- Leverage the same low level RNG data extraction code for both async
and blocking caam_read() scenarios, get rid of the shared job
descriptor and make non-shared one as a simple as possible (just
HEADER + ALGORITHM OPERATION + FIFO STORE)

- Split private context from DMA related memory, so that the former
could be allocated without GFP_DMA.

NOTE: On its face value this commit decreased throughput numbers
reported by

dd if=/dev/hwrng of=/dev/null bs=1 count=100K [iflag=nonblock]

by about 15%, however commits that enable prediction resistance and
limit JR total size impact the performance so much and move the
bottleneck such as to make this regression irrelevant.

NOTE: On the bright side, this commit reduces RNG in kernel DMA buffer
memory usage from 2 x RN_BUF_SIZE (~256K) to 32K.

Signed-off-by: Andrey Smirnov <[email protected]>
Cc: Chris Healy <[email protected]>
Cc: Lucas Stach <[email protected]>
Cc: Horia Geantă <[email protected]>
Cc: Herbert Xu <[email protected]>
Cc: Iuliana Prodan <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
---
drivers/crypto/caam/caamrng.c | 321 ++++++++++++----------------------
1 file changed, 107 insertions(+), 214 deletions(-)

diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c
index 47b15c25b66f..cb498186b9b9 100644
--- a/drivers/crypto/caam/caamrng.c
+++ b/drivers/crypto/caam/caamrng.c
@@ -7,35 +7,12 @@
*
* Based on caamalg.c crypto API driver.
*
- * relationship between job descriptors to shared descriptors:
- *
- * --------------- --------------
- * | JobDesc #0 |-------------------->| ShareDesc |
- * | *(buffer 0) | |------------->| (generate) |
- * --------------- | | (move) |
- * | | (store) |
- * --------------- | --------------
- * | JobDesc #1 |------|
- * | *(buffer 1) |
- * ---------------
- *
- * A job desc looks like this:
- *
- * ---------------------
- * | Header |
- * | ShareDesc Pointer |
- * | SEQ_OUT_PTR |
- * | (output buffer) |
- * ---------------------
- *
- * The SharedDesc never changes, and each job descriptor points to one of two
- * buffers for each device, from which the data will be copied into the
- * requested destination
*/

#include <linux/hw_random.h>
#include <linux/completion.h>
#include <linux/atomic.h>
+#include <linux/kfifo.h>

#include "compat.h"

@@ -45,38 +22,34 @@
#include "jr.h"
#include "error.h"

+/* length of descriptors */
+#define CAAM_RNG_MAX_FIFO_STORE_SIZE U16_MAX
+
+#define CAAM_RNG_FIFO_LEN SZ_32K /* Must be a multiple of 2 */
+
/*
- * Maximum buffer size: maximum number of random, cache-aligned bytes that
- * will be generated and moved to seq out ptr (extlen not allowed)
+ * See caam_init_desc()
*/
-#define RN_BUF_SIZE (0xffff / L1_CACHE_BYTES * \
- L1_CACHE_BYTES)
+#define CAAM_RNG_DESC_LEN (CAAM_CMD_SZ + \
+ CAAM_CMD_SZ + \
+ CAAM_CMD_SZ + CAAM_PTR_SZ_MAX)

-/* length of descriptors */
-#define DESC_JOB_O_LEN (CAAM_CMD_SZ * 2 + CAAM_PTR_SZ_MAX * 2)
-#define DESC_RNG_LEN (3 * CAAM_CMD_SZ)
-
-/* Buffer, its dma address and lock */
-struct buf_data {
- u8 buf[RN_BUF_SIZE] ____cacheline_aligned;
- dma_addr_t addr;
- struct completion filled;
- u32 hw_desc[DESC_JOB_O_LEN];
-#define BUF_NOT_EMPTY 0
-#define BUF_EMPTY 1
-#define BUF_PENDING 2 /* Empty, but with job pending --don't submit another */
- atomic_t empty;
+typedef u8 caam_rng_desc[CAAM_RNG_DESC_LEN];
+
+enum {
+ DESC_ASYNC,
+ DESC_SYNC,
+ DESC_NUM,
};

/* rng per-device context */
struct caam_rng_ctx {
struct hwrng rng;
struct device *jrdev;
- dma_addr_t sh_desc_dma;
- u32 sh_desc[DESC_RNG_LEN];
- unsigned int cur_buf_idx;
- int current_buf;
- struct buf_data bufs[2];
+ struct device *ctrldev;
+ caam_rng_desc *desc;
+ struct work_struct worker;
+ struct kfifo fifo;
};

static struct caam_rng_ctx *to_caam_rng_ctx(struct hwrng *r)
@@ -84,228 +57,146 @@ static struct caam_rng_ctx *to_caam_rng_ctx(struct hwrng *r)
return container_of(r, struct caam_rng_ctx, rng);
}

-static inline void rng_unmap_buf(struct device *jrdev, struct buf_data *bd)
+static void caam_rng_done(struct device *jrdev, u32 *desc, u32 err,
+ void *context)
{
- if (bd->addr)
- dma_unmap_single(jrdev, bd->addr, RN_BUF_SIZE,
- DMA_FROM_DEVICE);
-}
-
-static inline void rng_unmap_ctx(struct caam_rng_ctx *ctx)
-{
- struct device *jrdev = ctx->jrdev;
-
- if (ctx->sh_desc_dma)
- dma_unmap_single(jrdev, ctx->sh_desc_dma,
- desc_bytes(ctx->sh_desc), DMA_TO_DEVICE);
- rng_unmap_buf(jrdev, &ctx->bufs[0]);
- rng_unmap_buf(jrdev, &ctx->bufs[1]);
-}
-
-static void rng_done(struct device *jrdev, u32 *desc, u32 err, void *context)
-{
- struct buf_data *bd;
-
- bd = container_of(desc, struct buf_data, hw_desc[0]);
+ struct completion *done = context;

if (err)
caam_jr_strstatus(jrdev, err);

- atomic_set(&bd->empty, BUF_NOT_EMPTY);
- complete(&bd->filled);
-
- /* Buffer refilled, invalidate cache */
- dma_sync_single_for_cpu(jrdev, bd->addr, RN_BUF_SIZE, DMA_FROM_DEVICE);
-
- print_hex_dump_debug("rng refreshed buf@: ", DUMP_PREFIX_ADDRESS, 16, 4,
- bd->buf, RN_BUF_SIZE, 1);
+ complete(done);
}

-static inline int submit_job(struct caam_rng_ctx *ctx, int to_current)
+static u32 *caam_init_desc(u32 *desc, dma_addr_t dst_dma, int len)
{
- struct buf_data *bd = &ctx->bufs[!(to_current ^ ctx->current_buf)];
- struct device *jrdev = ctx->jrdev;
- u32 *desc = bd->hw_desc;
- int err;
+ init_job_desc(desc, 0); /* + 1 cmd_sz */
+ /* Generate random bytes: + 1 cmd_sz */
+ append_operation(desc, OP_ALG_ALGSEL_RNG | OP_TYPE_CLASS1_ALG);
+ /* Store bytes */
+ append_fifo_store(desc, dst_dma, len, FIFOST_TYPE_RNGSTORE);

- dev_dbg(jrdev, "submitting job %d\n", !(to_current ^ ctx->current_buf));
- init_completion(&bd->filled);
- err = caam_jr_enqueue(jrdev, desc, rng_done, ctx);
- if (err)
- complete(&bd->filled); /* don't wait on failed job*/
- else
- atomic_inc(&bd->empty); /* note if pending */
+ print_hex_dump_debug("rng job desc@: ", DUMP_PREFIX_ADDRESS,
+ 16, 4, desc, desc_bytes(desc), 1);

- return err;
+ return desc;
}

-static int caam_read(struct hwrng *rng, void *data, size_t max, bool wait)
+static int caam_rng_read_one(struct device *jrdev,
+ void *dst, int len,
+ void *desc,
+ struct completion *done)
{
- struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
- struct buf_data *bd = &ctx->bufs[ctx->current_buf];
- int next_buf_idx, copied_idx;
+ dma_addr_t dst_dma;
int err;

- if (atomic_read(&bd->empty)) {
- /* try to submit job if there wasn't one */
- if (atomic_read(&bd->empty) == BUF_EMPTY) {
- err = submit_job(ctx, 1);
- /* if can't submit job, can't even wait */
- if (err)
- return 0;
- }
- /* no immediate data, so exit if not waiting */
- if (!wait)
- return 0;
-
- /* waiting for pending job */
- if (atomic_read(&bd->empty))
- wait_for_completion(&bd->filled);
- }
-
- next_buf_idx = ctx->cur_buf_idx + max;
- dev_dbg(ctx->jrdev, "%s: start reading at buffer %d, idx %d\n",
- __func__, ctx->current_buf, ctx->cur_buf_idx);
+ len = min_t(int, len, CAAM_RNG_MAX_FIFO_STORE_SIZE);

- /* if enough data in current buffer */
- if (next_buf_idx < RN_BUF_SIZE) {
- memcpy(data, bd->buf + ctx->cur_buf_idx, max);
- ctx->cur_buf_idx = next_buf_idx;
- return max;
+ dst_dma = dma_map_single(jrdev, dst, len, DMA_FROM_DEVICE);
+ if (dma_mapping_error(jrdev, dst_dma)) {
+ dev_err(jrdev, "unable to map destination memory\n");
+ return -ENOMEM;
}

- /* else, copy what's left... */
- copied_idx = RN_BUF_SIZE - ctx->cur_buf_idx;
- memcpy(data, bd->buf + ctx->cur_buf_idx, copied_idx);
- ctx->cur_buf_idx = 0;
- atomic_set(&bd->empty, BUF_EMPTY);
-
- /* ...refill... */
- submit_job(ctx, 1);
+ init_completion(done);
+ err = caam_jr_enqueue(jrdev,
+ caam_init_desc(desc, dst_dma, len),
+ caam_rng_done, done);
+ if (!err)
+ wait_for_completion(done);

- /* and use next buffer */
- ctx->current_buf = !ctx->current_buf;
- dev_dbg(ctx->jrdev, "switched to buffer %d\n", ctx->current_buf);
+ dma_unmap_single(jrdev, dst_dma, len, DMA_FROM_DEVICE);

- /* since there already is some data read, don't wait */
- return copied_idx + caam_read(rng, data + copied_idx,
- max - copied_idx, false);
+ return err ?: len;
}

-static inline int rng_create_sh_desc(struct caam_rng_ctx *ctx)
+static void caam_rng_fill_async(struct caam_rng_ctx *ctx)
{
- struct device *jrdev = ctx->jrdev;
- u32 *desc = ctx->sh_desc;
-
- init_sh_desc(desc, HDR_SHARE_SERIAL);
-
- /* Generate random bytes */
- append_operation(desc, OP_ALG_ALGSEL_RNG | OP_TYPE_CLASS1_ALG);
-
- /* Store bytes */
- append_seq_fifo_store(desc, RN_BUF_SIZE, FIFOST_TYPE_RNGSTORE);
-
- ctx->sh_desc_dma = dma_map_single(jrdev, desc, desc_bytes(desc),
- DMA_TO_DEVICE);
- if (dma_mapping_error(jrdev, ctx->sh_desc_dma)) {
- dev_err(jrdev, "unable to map shared descriptor\n");
- return -ENOMEM;
- }
-
- print_hex_dump_debug("rng shdesc@: ", DUMP_PREFIX_ADDRESS, 16, 4,
- desc, desc_bytes(desc), 1);
+ struct scatterlist sg[1];
+ struct completion done;
+ int len, nents;
+
+ sg_init_table(sg, ARRAY_SIZE(sg));
+ nents = kfifo_dma_in_prepare(&ctx->fifo, sg, ARRAY_SIZE(sg),
+ CAAM_RNG_FIFO_LEN);
+ if (!nents)
+ return;
+
+ len = caam_rng_read_one(ctx->jrdev, sg_virt(&sg[0]),
+ sg[0].length,
+ &ctx->desc[DESC_ASYNC],
+ &done);
+ if (len < 0)
+ return;
+
+ kfifo_dma_in_finish(&ctx->fifo, len);
+}

- return 0;
+static void caam_rng_worker(struct work_struct *work)
+{
+ struct caam_rng_ctx *ctx = container_of(work, struct caam_rng_ctx,
+ worker);
+ caam_rng_fill_async(ctx);
}

-static inline int rng_create_job_desc(struct caam_rng_ctx *ctx, int buf_id)
+static int caam_read(struct hwrng *rng, void *dst, size_t max, bool wait)
{
- struct device *jrdev = ctx->jrdev;
- struct buf_data *bd = &ctx->bufs[buf_id];
- u32 *desc = bd->hw_desc;
- int sh_len = desc_len(ctx->sh_desc);
+ struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
+ int out;

- init_job_desc_shared(desc, ctx->sh_desc_dma, sh_len, HDR_SHARE_DEFER |
- HDR_REVERSE);
+ if (wait) {
+ struct completion done;

- bd->addr = dma_map_single(jrdev, bd->buf, RN_BUF_SIZE, DMA_FROM_DEVICE);
- if (dma_mapping_error(jrdev, bd->addr)) {
- dev_err(jrdev, "unable to map dst\n");
- return -ENOMEM;
+ return caam_rng_read_one(ctx->jrdev, dst, max,
+ ctx->desc[DESC_SYNC], &done);
}

- append_seq_out_ptr_intlen(desc, bd->addr, RN_BUF_SIZE, 0);
-
- print_hex_dump_debug("rng job desc@: ", DUMP_PREFIX_ADDRESS, 16, 4,
- desc, desc_bytes(desc), 1);
+ out = kfifo_out(&ctx->fifo, dst, max);
+ if (kfifo_len(&ctx->fifo) <= CAAM_RNG_FIFO_LEN / 2)
+ schedule_work(&ctx->worker);

- return 0;
+ return out;
}

static void caam_cleanup(struct hwrng *rng)
{
struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
- int i;
- struct buf_data *bd;
-
- for (i = 0; i < 2; i++) {
- bd = &ctx->bufs[i];
- if (atomic_read(&bd->empty) == BUF_PENDING)
- wait_for_completion(&bd->filled);
- }

- rng_unmap_ctx(ctx);
+ flush_work(&ctx->worker);
caam_jr_free(ctx->jrdev);
+ kfifo_free(&ctx->fifo);
}

-static int caam_init_buf(struct caam_rng_ctx *ctx, int buf_id)
+static int caam_init(struct hwrng *rng)
{
- struct buf_data *bd = &ctx->bufs[buf_id];
+ struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
int err;

- err = rng_create_job_desc(ctx, buf_id);
- if (err)
- return err;
-
- atomic_set(&bd->empty, BUF_EMPTY);
- submit_job(ctx, buf_id == ctx->current_buf);
- wait_for_completion(&bd->filled);
+ ctx->desc = devm_kcalloc(ctx->ctrldev, DESC_NUM, sizeof(*ctx->desc),
+ GFP_DMA | GFP_KERNEL);
+ if (!ctx->desc)
+ return -ENOMEM;

- return 0;
-}
+ if (kfifo_alloc(&ctx->fifo, CAAM_RNG_FIFO_LEN, GFP_DMA | GFP_KERNEL))
+ return -ENOMEM;

-static int caam_init(struct hwrng *rng)
-{
- struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
- int err;
+ INIT_WORK(&ctx->worker, caam_rng_worker);

ctx->jrdev = caam_jr_alloc();
err = PTR_ERR_OR_ZERO(ctx->jrdev);
if (err) {
+ kfifo_free(&ctx->fifo);
pr_err("Job Ring Device allocation for transform failed\n");
return err;
}

- err = rng_create_sh_desc(ctx);
- if (err)
- goto free_jrdev;
-
- ctx->current_buf = 0;
- ctx->cur_buf_idx = 0;
-
- err = caam_init_buf(ctx, 0);
- if (err)
- goto free_jrdev;
-
- err = caam_init_buf(ctx, 1);
- if (err)
- goto free_jrdev;
+ /*
+ * Fill async buffer to have early randomness data for
+ * hw_random
+ */
+ caam_rng_fill_async(ctx);

return 0;
-
-free_jrdev:
- caam_jr_free(ctx->jrdev);
- return err;
}

int caam_rng_init(struct device *ctrldev);
@@ -335,10 +226,12 @@ int caam_rng_init(struct device *ctrldev)
if (!devres_open_group(ctrldev, caam_rng_init, GFP_KERNEL))
return -ENOMEM;

- ctx = devm_kzalloc(ctrldev, sizeof(*ctx), GFP_DMA | GFP_KERNEL);
+ ctx = devm_kzalloc(ctrldev, sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;

+ ctx->ctrldev = ctrldev;
+
ctx->rng.name = "rng-caam";
ctx->rng.init = caam_init;
ctx->rng.cleanup = caam_cleanup;
--
2.21.0


2020-02-24 17:17:18

by Andrey Smirnov

[permalink] [raw]
Subject: Re: [PATCH v7 5/9] crypto: caam - simplify RNG implementation

On Wed, Feb 12, 2020 at 5:20 AM Horia Geanta <[email protected]> wrote:
>
> On 1/27/2020 6:57 PM, Andrey Smirnov wrote:
> > Rework CAAM RNG implementation as follows:
> >
> > - Make use of the fact that HWRNG supports partial reads and will
> > handle such cases gracefully by removing recursion in caam_read()
> >
> > - Convert blocking caam_read() codepath to do a single blocking job
> > read directly into requested buffer, bypassing any intermediary
> > buffers
> >
> > - Convert async caam_read() codepath into a simple single
> > reader/single writer FIFO use-case, thus simplifying concurrency
> > handling and delegating buffer read/write position management to KFIFO
> > subsystem.
> >
> > - Leverage the same low level RNG data extraction code for both async
> > and blocking caam_read() scenarios, get rid of the shared job
> > descriptor and make non-shared one as a simple as possible (just
> > HEADER + ALGORITHM OPERATION + FIFO STORE)
> >
> > - Split private context from DMA related memory, so that the former
> > could be allocated without GFP_DMA.
> >
> > NOTE: On its face value this commit decreased throughput numbers
> > reported by
> >
> > dd if=/dev/hwrng of=/dev/null bs=1 count=100K [iflag=nonblock]
> >
> > by about 15%, however commits that enable prediction resistance and
> Running dd as mentioned above, on a i.MX8MM board I see:
> ~ 20% decrease in non-blocking case (525 kB/s vs. 662 kB/s)
> ~ 75% decrease in blocking case (170 kB/s vs. 657 kB/s)
>
> bs=1 is a bit drastic.
> Using bs=16 the numbers look better in terms of overall speed,
> however the relative degradation is still there:
> ~ 66% decrease in blocking case (3.5 MB/s vs. 10.1 MB/s)
>
> > limit JR total size impact the performance so much and move the
> > bottleneck such as to make this regression irrelevant.
> >
> Yes, performance is greatly impacted by moving from a DRBG configuration
> to a TRNG one.
>
> The speed that I get with this patch set (1.3 kB/s)
> is ~ 20% lower than theoretical output (1.583 kB/s) (see below).
> Seeing this and also the relative decrease in case of DRBG
> makes me wonder whether the SW overhead could be lowered.
>
> Theoretical TRNG output speed in this configuration
> can be computed as:
> Speed = (SZ x CAAM_CLK_FREQ) / (RTSDCTL[ENT_DLY] x RTSDCTL[SAMP_SIZE]) [bps]
>
> SZ is sample taken from the DRBG, b/w two consecutive reseedings.
> As previously discussed, this is limited to 128 bits (16 bytes),
> such that the DRBG behaves as a TRNG.
>
> If:
> -CAAM_CLK_FREQ = 166 MHz (as for i.MXM*)
> -RTSDCTL[ENT_DLY] = 3200 clocks (default / POR value)
> -RTSDCTL[SAMP_SIZE] = 512 (recommended; default / POR value is 2500)
> then theoretical speed is 1.583 kB/s.
>
> > @@ -45,38 +22,34 @@
> > #include "jr.h"
> > #include "error.h"
> >
> > +/* length of descriptors */
> This comment is misplaced, length of descriptors (CAAM_RNG_DESC_LEN)
> is further below.
>
> > +#define CAAM_RNG_MAX_FIFO_STORE_SIZE U16_MAX
> > +
> > +#define CAAM_RNG_FIFO_LEN SZ_32K /* Must be a multiple of 2 */
> > +
> > /*
> > - * Maximum buffer size: maximum number of random, cache-aligned bytes that
> > - * will be generated and moved to seq out ptr (extlen not allowed)
> > + * See caam_init_desc()
> > */
> > -#define RN_BUF_SIZE (0xffff / L1_CACHE_BYTES * \
> > - L1_CACHE_BYTES)
> > +#define CAAM_RNG_DESC_LEN (CAAM_CMD_SZ + \
> > + CAAM_CMD_SZ + \
> > + CAAM_CMD_SZ + CAAM_PTR_SZ_MAX)
>
> > +typedef u8 caam_rng_desc[CAAM_RNG_DESC_LEN];
> Is this really necessary?
>

Will drop in v8.

> > -static int caam_read(struct hwrng *rng, void *data, size_t max, bool wait)
> > +static int caam_rng_read_one(struct device *jrdev,
> > + void *dst, int len,
> > + void *desc,
> > + struct completion *done)
> [...]
> > + len = min_t(int, len, CAAM_RNG_MAX_FIFO_STORE_SIZE);
> For the blocking case, i.e. caam_read() -> caam_rng_read_one(),
> "len" is at least 32B - cf. include/linux/hw_random.h:
> * @read: New API. drivers can fill up to max bytes of data
> * into the buffer. The buffer is aligned for any type
> * and max is a multiple of 4 and >= 32 bytes.
>
> For reducing the SW overhead, it might be worth optimizing this path.
> For example, considering
> min_t(int, len, CAAM_RNG_MAX_FIFO_STORE_SIZE) = CAAM_RNG_MAX_FIFO_STORE_SIZE

This isn't true until next commit. Here CAAM_RNG_MAX_FIFO_STORE_SIZE
is still 32K, so "len" is going to be smaller. I'll update the code to
assume fixed length once CAAM_RNG_MAX_FIFO_STORE_SIZE becomes 16
bytes.

> this means length is fixed, thus also ctx->desc[DESC_SYNC] descriptor is fixed
> and its generation could be moved out of the hot path.

That descriptor also includes DMA address of a buffer we are given, so
the descriptor is never fixed.

Thanks,
Andrey Smirnov