Hi all,
I've been looking into a CAAM RNG issue for a while, and I could use
some input from people who know the CAAM hardware better than I do.
Basically the issue is that on some i.MX6 units the RNG functionality
sometimes fails with this error:
caam_jr 2101000.jr0: 20003c5b: CCB: desc idx 60: RNG: Hardware error.
I can tell that it is related to the entropy delay. On all failing
units the RNG4 gets instantiated with the default entropy delay of
3200. If I dial up the delay to 3600 or 4000 the RNG works reliably. As
a negative test I changed the initial delay to 400. With this change
all units are able to successfully instantiate the RNG handles at an
entropy delay of 2000 or 2400, but then reliably fail at getting random
data with the error shown above. I guess the issue is related to
prediction resistance on the handles, which causes the PRNG to be re-
seeded from the TRNG fairly often.
Now I don't have a good idea on how to arrive at a reliably working
entropy delay setting, as apparently the simple "are we able to
instantiate the handle" check is not enough to actually guarantee a
working RNG setup. Any suggestions?
Regards,
Lucas
On 12/14/2020 9:00 PM, Lucas Stach wrote:
> Hi all,
>
> I've been looking into a CAAM RNG issue for a while, where I could need
> some input from people knowing the CAAM hardware better than I do.
> Basically the issue is that on some i.MX6 units the RNG functionality
> sometimes fails with this error:
> caam_jr 2101000.jr0: 20003c5b: CCB: desc idx 60: RNG: Hardware error.
>
> I can tell that it is related to the entropy delay. On all failing
> units the RNG4 gets instantiated with the default entropy delay of
> 3200. If I dial up the delay to 3600 or 4000 the RNG works reliably. As
> a negative test I changed the initial delay to 400. With this change
> all units are able to successfully instantiate the RNG handles at an
> entropy delay of 2000 or 2400, but then reliably fail at getting random
> data with the error shown above. I guess the issue is related to
> prediction resistance on the handles, which causes the PRNG to be re-
> seeded from the TRNG fairly often.
>
> Now I don't have a good idea on how to arrive at a reliably working
> entropy delay setting, as apparently the simple "are we able to
> instantiate the handle" check is not enough to actually guarantee a
> working RNG setup. Any suggestions?
>
The successful instantiation of the RNG state handle(s) means that
the HW self-tests passed, but this doesn't mean RNG will work flawlessly.
A properly configured RNG should have a certain (very low) failure rate.
The logic in the caam rng driver is not checking this rate, since it's running
only once with a given configuration.
OTOH properly checking the RNG configuration would take some time, so it would
be better to run it offline. The "characterization" should also account for
temperature, voltage and process (fixed for a given SoC).
From this perspective, the caam rng driver should be updated to statically
configure the RNG with these offline-determined parameters.
Ideally we'd be able to use a single set of parameters to cover all SoCs
that have the same IP (RNG4 TRNG).
Unfortunately we're not there yet.
The situation became more visible after changing the caam rng driver to reseed
the PRNG before every request (practically making the PRNG function like a TRNG,
a hwrng framework requirement), since the HW self-tests are now running more
often than before.
Some questions that would give me more details about the exact issue you
and Robert are facing:
1. What SoC exactly are you running on?
2. How fast and how often is the RNG hardware error occurring?
Does this happen at boot time, only when stressing /dev/hwrng etc.?
3. Try dumping some of the RNG registers using the patch below:
-- >8 --
Subject: [PATCH] crypto: caam - rng debugging
Dump RNG registers at hwrng.init time and in case descriptor returns
RNG HW error.
Signed-off-by: Horia Geantă <[email protected]>
---
drivers/crypto/caam/caamrng.c | 9 ++++++++-
drivers/crypto/caam/ctrl.c | 29 +++++++++++++++++++++++++++++
drivers/crypto/caam/ctrl.h | 2 ++
drivers/crypto/caam/regs.h | 5 ++++-
4 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c
index 77d048dfe5d0..fc2192183696 100644
--- a/drivers/crypto/caam/caamrng.c
+++ b/drivers/crypto/caam/caamrng.c
@@ -16,6 +16,7 @@
#include "compat.h"
+#include "ctrl.h"
#include "regs.h"
#include "intern.h"
#include "desc_constr.h"
@@ -57,9 +58,12 @@ static void caam_rng_done(struct device *jrdev, u32 *desc, u32 err,
{
struct caam_rng_job_ctx *jctx = context;
- if (err)
+ if (err) {
*jctx->err = caam_jr_strstatus(jrdev, err);
+ caam_dump_rng_regs(jrdev);
+ }
+
complete(jctx->done);
}
@@ -199,6 +203,9 @@ static int caam_init(struct hwrng *rng)
return err;
}
+ dev_dbg(ctx->jrdev, "CAAM RNG - register status at hwrng.init time\n");
+ caam_dump_rng_regs(ctx->jrdev);
+
/*
* Fill async buffer to have early randomness data for
* hw_random
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index ca0361b2dbb0..52db32b599aa 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -27,6 +27,35 @@ EXPORT_SYMBOL(caam_dpaa2);
#include "qi.h"
#endif
+void caam_dump_rng_regs(struct device *jrdev)
+{
+ struct device *ctrldev = jrdev->parent;
+ struct caam_drv_private *ctrlpriv = dev_get_drvdata(ctrldev);
+ struct caam_ctrl __iomem *ctrl;
+ struct rng4tst __iomem *r4tst;
+ u32 rtmctl;
+
+ dev_dbg(jrdev, "RNG register dump:\n");
+
+ ctrl = (struct caam_ctrl __iomem *)ctrlpriv->ctrl;
+ r4tst = &ctrl->r4tst[0];
+
+ dev_dbg(jrdev, "\trdsta = 0x%08x\n", rd_reg32(&r4tst->rdsta));
+
+ rtmctl = rd_reg32(&r4tst->rtmctl);
+ dev_dbg(jrdev, "\trtmctl = 0x%08x\n", rtmctl);
+ dev_dbg(jrdev, "\trtstatus = 0x%08x\n", rd_reg32(&r4tst->rtstatus));
+
+ /* Group of registers that can be read only when RTMCTL[PRGM]=1 */
+ clrsetbits_32(&r4tst->rtmctl, 0, RTMCTL_PRGM | RTMCTL_ACC);
+ dev_dbg(jrdev, "\trtscmisc = 0x%08x\n", rd_reg32(&r4tst->rtscmisc));
+ dev_dbg(jrdev, "\trtfrqmin = 0x%08x\n", rd_reg32(&r4tst->rtfrqmin));
+ dev_dbg(jrdev, "\trtfrqmax = 0x%08x\n", rd_reg32(&r4tst->rtfrqmax));
+ clrsetbits_32(&r4tst->rtmctl, RTMCTL_PRGM | RTMCTL_ACC, RTMCTL_ERR);
+
+}
+EXPORT_SYMBOL(caam_dump_rng_regs);
+
/*
* Descriptor to instantiate RNG State Handle 0 in normal mode and
* load the JDKEK, TDKEK and TDSK registers
diff --git a/drivers/crypto/caam/ctrl.h b/drivers/crypto/caam/ctrl.h
index f3ecd67922a7..806f4563990c 100644
--- a/drivers/crypto/caam/ctrl.h
+++ b/drivers/crypto/caam/ctrl.h
@@ -11,4 +11,6 @@
/* Prototypes for backend-level services exposed to APIs */
extern bool caam_dpaa2;
+void caam_dump_rng_regs(struct device *ctrldev);
+
#endif /* CTRL_H */
diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h
index af61f3a2c0d4..dfc25a458a55 100644
--- a/drivers/crypto/caam/regs.h
+++ b/drivers/crypto/caam/regs.h
@@ -493,6 +493,7 @@ struct rngtst {
/* RNG4 TRNG test registers */
struct rng4tst {
#define RTMCTL_ACC BIT(5) /* TRNG access mode */
+#define RTMCTL_ERR BIT(12) /* TRNG error */
#define RTMCTL_PRGM BIT(16) /* 1 -> program mode, 0 -> run mode */
#define RTMCTL_SAMP_MODE_VON_NEUMANN_ES_SC 0 /* use von Neumann data in
both entropy shifter and
@@ -526,7 +527,9 @@ struct rng4tst {
u32 rtfrqmax; /* PRGM=1: freq. count max. limit register */
u32 rtfrqcnt; /* PRGM=0: freq. count register */
};
- u32 rsvd1[40];
+ u32 rsvd[7];
+ u32 rtstatus; /* TRNG status register */
+ u32 rsvd1[32];
#define RDSTA_SKVT 0x80000000
#define RDSTA_SKVN 0x40000000
#define RDSTA_PR0 BIT(4)
--
2.17.1
On Tue, 2021-03-02 at 19:33 +0200, Horia Geantă wrote:
> On 12/14/2020 9:00 PM, Lucas Stach wrote:
> > Hi all,
> >
> > I've been looking into a CAAM RNG issue for a while, where I could need
> > some input from people knowing the CAAM hardware better than I do.
> > Basically the issue is that on some i.MX6 units the RNG functionality
> > sometimes fails with this error:
> > caam_jr 2101000.jr0: 20003c5b: CCB: desc idx 60: RNG: Hardware error.
> >
> > I can tell that it is related to the entropy delay. On all failing
> > units the RNG4 gets instantiated with the default entropy delay of
> > 3200. If I dial up the delay to 3600 or 4000 the RNG works reliably. As
> > a negative test I changed the initial delay to 400. With this change
> > all units are able to successfully instantiate the RNG handles at an
> > entropy delay of 2000 or 2400, but then reliably fail at getting random
> > data with the error shown above. I guess the issue is related to
> > prediction resistance on the handles, which causes the PRNG to be re-
> > seeded from the TRNG fairly often.
> >
> > Now I don't have a good idea on how to arrive at a reliably working
> > entropy delay setting, as apparently the simple "are we able to
> > instantiate the handle" check is not enough to actually guarantee a
> > working RNG setup. Any suggestions?
> >
> The successful instantiation of the RNG state handle(s) means that
> the HW self-tests passed, but this doesn't mean RNG will work flawlessly.
>
> A properly configured RNG should have a certain (very low) failure rate.
> The logic in the caam rng driver is not checking this rate, since it's
> running
> only once with a given configuration.
> OTOH properly checking the RNG configuration would take some time, so it
> would
> be better to run it offline. The "characterization" should also account for
> temperature, voltage and process (fixed for a given SoC).
>
> From this perspective, the caam rng driver should be updated to statically
> configure the RNG with these offline-determined parameters.
> Ideally we'd be able to use a single set of parameters to cover all SoCs
> that have the same IP (RNG4 TRNG).
> Unfortunately we're not there yet.
>
> The situation became more visible after changing the caam rng driver to
> reseed
> the PRNG before every request (practically making the PRNG function like a
> TRNG,
> a hwrng framework requirement), since the HW self-tests are now running more
> often then before.
>
> Some questions that would give me more details about the exact issue you
> and Robert are facing:
>
> 1. What SoC exactly are you running on?
>
> 2. How fast and how often is the RNG hardware error occurring?
> Does this happen at boot time, only when stressing /dev/hwrng etc.?
We are using an iMX6D. In our case, it seems this is occurring relatively
rarely - I have only seen this occur on a few boots. When it has happened, it
started reporting errors at boot and regularly thereafter - probably as a
result of accesses being made by the rngd daemon.
>
> 3. Try dumping some of the RNG registers using below patch:
>
> -- >8 --
>
> Subject: [PATCH] crypto: caam - rng debugging
>
> Dump RNG registers at hwrng.init time and in case descriptor returns
> RNG HW error.
>
> Signed-off-by: Horia Geantă <[email protected]>
> ---
> drivers/crypto/caam/caamrng.c | 9 ++++++++-
> drivers/crypto/caam/ctrl.c | 29 +++++++++++++++++++++++++++++
> drivers/crypto/caam/ctrl.h | 2 ++
> drivers/crypto/caam/regs.h | 5 ++++-
> 4 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c
> index 77d048dfe5d0..fc2192183696 100644
> --- a/drivers/crypto/caam/caamrng.c
> +++ b/drivers/crypto/caam/caamrng.c
> @@ -16,6 +16,7 @@
>
> #include "compat.h"
>
> +#include "ctrl.h"
> #include "regs.h"
> #include "intern.h"
> #include "desc_constr.h"
> @@ -57,9 +58,12 @@ static void caam_rng_done(struct device *jrdev, u32 *desc,
> u32 err,
> {
> struct caam_rng_job_ctx *jctx = context;
>
> - if (err)
> + if (err) {
> *jctx->err = caam_jr_strstatus(jrdev, err);
>
> + caam_dump_rng_regs(jrdev);
> + }
> +
> complete(jctx->done);
> }
>
> @@ -199,6 +203,9 @@ static int caam_init(struct hwrng *rng)
> return err;
> }
>
> + dev_dbg(ctx->jrdev, "CAAM RNG - register status at hwrng.init time\n");
> + caam_dump_rng_regs(ctx->jrdev);
> +
> /*
> * Fill async buffer to have early randomness data for
> * hw_random
> diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
> index ca0361b2dbb0..52db32b599aa 100644
> --- a/drivers/crypto/caam/ctrl.c
> +++ b/drivers/crypto/caam/ctrl.c
> @@ -27,6 +27,35 @@ EXPORT_SYMBOL(caam_dpaa2);
> #include "qi.h"
> #endif
>
> +void caam_dump_rng_regs(struct device *jrdev)
> +{
> + struct device *ctrldev = jrdev->parent;
> + struct caam_drv_private *ctrlpriv = dev_get_drvdata(ctrldev);
> + struct caam_ctrl __iomem *ctrl;
> + struct rng4tst __iomem *r4tst;
> + u32 rtmctl;
> +
> + dev_dbg(jrdev, "RNG register dump:\n");
> +
> + ctrl = (struct caam_ctrl __iomem *)ctrlpriv->ctrl;
> + r4tst = &ctrl->r4tst[0];
> +
> + dev_dbg(jrdev, "\trdsta = 0x%08x\n", rd_reg32(&r4tst->rdsta));
> +
> + rtmctl = rd_reg32(&r4tst->rtmctl);
> + dev_dbg(jrdev, "\trtmctl = 0x%08x\n", rtmctl);
> + dev_dbg(jrdev, "\trtstatus = 0x%08x\n", rd_reg32(&r4tst->rtstatus));
> +
> + /* Group of registers that can be read only when RTMCTL[PRGM]=1 */
> + clrsetbits_32(&r4tst->rtmctl, 0, RTMCTL_PRGM | RTMCTL_ACC);
> + dev_dbg(jrdev, "\trtscmisc = 0x%08x\n", rd_reg32(&r4tst->rtscmisc));
> + dev_dbg(jrdev, "\trtfrqmin = 0x%08x\n", rd_reg32(&r4tst->rtfrqmin));
> + dev_dbg(jrdev, "\trtfrqmax = 0x%08x\n", rd_reg32(&r4tst->rtfrqmax));
> + clrsetbits_32(&r4tst->rtmctl, RTMCTL_PRGM | RTMCTL_ACC, RTMCTL_ERR);
> +
> +}
> +EXPORT_SYMBOL(caam_dump_rng_regs);
> +
> /*
> * Descriptor to instantiate RNG State Handle 0 in normal mode and
> * load the JDKEK, TDKEK and TDSK registers
> diff --git a/drivers/crypto/caam/ctrl.h b/drivers/crypto/caam/ctrl.h
> index f3ecd67922a7..806f4563990c 100644
> --- a/drivers/crypto/caam/ctrl.h
> +++ b/drivers/crypto/caam/ctrl.h
> @@ -11,4 +11,6 @@
> /* Prototypes for backend-level services exposed to APIs */
> extern bool caam_dpaa2;
>
> +void caam_dump_rng_regs(struct device *ctrldev);
> +
> #endif /* CTRL_H */
> diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h
> index af61f3a2c0d4..dfc25a458a55 100644
> --- a/drivers/crypto/caam/regs.h
> +++ b/drivers/crypto/caam/regs.h
> @@ -493,6 +493,7 @@ struct rngtst {
> /* RNG4 TRNG test registers */
> struct rng4tst {
> #define RTMCTL_ACC BIT(5) /* TRNG access mode */
> +#define RTMCTL_ERR BIT(12) /* TRNG error */
> #define RTMCTL_PRGM BIT(16) /* 1 -> program mode, 0 -> run mode */
> #define RTMCTL_SAMP_MODE_VON_NEUMANN_ES_SC 0 /* use von Neumann data in
> both entropy shifter and
> @@ -526,7 +527,9 @@ struct rng4tst {
> u32 rtfrqmax; /* PRGM=1: freq. count max. limit register */
> u32 rtfrqcnt; /* PRGM=0: freq. count register */
> };
> - u32 rsvd1[40];
> + u32 rsvd[7];
> + u32 rtstatus; /* TRNG status register */
> + u32 rsvd1[32];
> #define RDSTA_SKVT 0x80000000
> #define RDSTA_SKVN 0x40000000
> #define RDSTA_PR0 BIT(4)
--
Robert Hancock
Senior Hardware Designer, Calian Advanced Technologies
http://www.calian.com
From: Petr Benes <[email protected]>
Each time TRNG generates entropy, statistical tests are run.
If they fail, RETRY_COUNT value is decremented. Once it
reaches 0, HW RNG returns an error, and needs to be reset.
RETRY_COUNT can be programmed in the RTSCMISC register and is
set to 1 by default. Hence, we are left without hwrng after
the first error, which could happen even under normal
conditions.
Cc: [email protected]
Signed-off-by: Petr Benes <[email protected]>
Signed-off-by: Michal Vokáč <[email protected]>
---
Hi,
we are also experiencing this issue:
caam_jr 2101000.jr0: 20003c5b: CCB: desc idx 60: RNG: Hardware error.
It is happening on both i.MX6S and i.MX6DL SoCs we use.
On Solo I can reproduce it really fast. Sometimes it happens right
after the board is NFS booted, sometimes I need to stress the HWRNG
for a while (generate a few hundred KB of random data). On some
DualLite SoCs it is happening at least once a day.
We are using the v5.10 LTS branch but I can confirm that this is
happening on all kernels from v5.7 up to the latest linux-next.
We also tried to increase the RTSDCTL_ENT_DLY_MIN delay as suggested
in this thread [1]. It helped and the issue has not occurred since then,
but we are looking for a more universal and permanent solution suitable
for upstream, hence we came up with this patch.
Any comments will be appreciated.
Thanks, Michal
[1] https://lkml.org/lkml/2021/8/30/296
drivers/crypto/caam/caamrng.c | 42 ++++++++++++++++++++++++++++++++---
drivers/crypto/caam/ctrl.c | 13 +++++++++++
drivers/crypto/caam/ctrl.h | 2 ++
3 files changed, 54 insertions(+), 3 deletions(-)
diff --git a/drivers/crypto/caam/caamrng.c b/drivers/crypto/caam/caamrng.c
index 77d048dfe5d0..2be5584ae591 100644
--- a/drivers/crypto/caam/caamrng.c
+++ b/drivers/crypto/caam/caamrng.c
@@ -21,6 +21,7 @@
#include "desc_constr.h"
#include "jr.h"
#include "error.h"
+#include "ctrl.h"
#define CAAM_RNG_MAX_FIFO_STORE_SIZE 16
@@ -113,6 +114,35 @@ static int caam_rng_read_one(struct device *jrdev,
return err ?: (ret ?: len);
}
+static void caam_rng_retry_reset(struct caam_rng_ctx *context)
+{
+ struct device *ctrldev = context->ctrldev;
+ struct caam_drv_private *ctrlpriv = dev_get_drvdata(ctrldev);
+ struct caam_ctrl __iomem *ctrl;
+ struct rng4tst __iomem *r4tst;
+ u32 __iomem *rtstatus;
+ u32 retry_count;
+
+ ctrl = (struct caam_ctrl __iomem *)ctrlpriv->ctrl;
+ r4tst = &ctrl->r4tst[0];
+
+ /*
+ * There is unfortunately no member for RTSTATUS register in
+ * struct rng4tst and the structure doesn't look stable
+ */
+ rtstatus = (u32 *)((char *)&ctrl->r4tst[0] + 0x3C);
+ retry_count = (rd_reg32(rtstatus) >> 16) & 0xf;
+ dev_dbg(ctrldev, "CAAM RNG retry count %d\n", retry_count);
+ if (retry_count == 0) {
+ dev_err(ctrldev, "CAAM RNG resetting retry count to 1\n");
+ clrsetbits_32(&r4tst->rtmctl, 0, RTMCTL_PRGM | RTMCTL_ACC);
+ wr_reg32(&r4tst->rtscmisc, (rd_reg32(&r4tst->rtscmisc) & 0x7f) | (1 << 16));
+ clrsetbits_32(&r4tst->rtmctl, RTMCTL_PRGM | RTMCTL_ACC,
+ RTMCTL_SAMP_MODE_RAW_ES_SC);
+ caam_reinstantiate_rng(ctrldev);
+ }
+}
+
static void caam_rng_fill_async(struct caam_rng_ctx *ctx)
{
struct scatterlist sg[1];
@@ -129,8 +159,10 @@ static void caam_rng_fill_async(struct caam_rng_ctx *ctx)
sg[0].length,
ctx->desc_async,
&done);
- if (len < 0)
+ if (len < 0) {
+ caam_rng_retry_reset(ctx);
return;
+ }
kfifo_dma_in_finish(&ctx->fifo, len);
}
@@ -145,13 +177,17 @@ static void caam_rng_worker(struct work_struct *work)
static int caam_read(struct hwrng *rng, void *dst, size_t max, bool wait)
{
struct caam_rng_ctx *ctx = to_caam_rng_ctx(rng);
- int out;
+ int out, ret;
if (wait) {
struct completion done;
- return caam_rng_read_one(ctx->jrdev, dst, max,
+ ret = caam_rng_read_one(ctx->jrdev, dst, max,
ctx->desc_sync, &done);
+ if (ret < 0)
+ caam_rng_retry_reset(ctx);
+
+ return ret;
}
out = kfifo_out(&ctx->fifo, dst, max);
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index ca0361b2dbb0..e421f8d1982b 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -339,6 +339,19 @@ static int instantiate_rng(struct device *ctrldev, int state_handle_mask,
return devm_add_action_or_reset(ctrldev, devm_deinstantiate_rng, ctrldev);
}
+/*
+ * caam_reinstantiate_rng - reinstantiates RNG. Intended for a case when RNG falls into
+ * HW error condition. That happens if TRNG fails statistical
+ * check and RTY_CNT value set in RTSCMISC decrements to zero.
+ * It is exported to caamrng.c
+ * @ctrldev - pointer to device
+ */
+
+int caam_reinstantiate_rng(struct device *ctrldev)
+{
+ return instantiate_rng(ctrldev, 0, 0);
+}
+
/*
* kick_trng - sets the various parameters for enabling the initialization
* of the RNG4 block in CAAM
diff --git a/drivers/crypto/caam/ctrl.h b/drivers/crypto/caam/ctrl.h
index f3ecd67922a7..26ff5a49a865 100644
--- a/drivers/crypto/caam/ctrl.h
+++ b/drivers/crypto/caam/ctrl.h
@@ -8,6 +8,8 @@
#ifndef CTRL_H
#define CTRL_H
+int caam_reinstantiate_rng(struct device *ctrldev);
+
/* Prototypes for backend-level services exposed to APIs */
extern bool caam_dpaa2;
--
2.25.1