2021-07-05 06:10:39

by Bharat Bhushan

[permalink] [raw]
Subject: [PATCH] clocksource: Add Marvell Errata-38627 workaround

CPU pipeline have unpredicted behavior when timer
interrupt appears and then disappears prior to the
exception happening. Time interrupt appears on timer
expiry and disappears when timer programming or timer
disable. This typically can happen when a load
instruction misses in the cache, which can take
few hundreds of cycles, and an interrupt appears
after the load instruction starts executing but
disappears before the load instruction completes.

Workaround of this is to ensure maximum 2us of time
gap between timer interrupt and timer programming
which can de-assert timer interrupt.

Signed-off-by: Linu Cherian <[email protected]>
Signed-off-by: Bharat Bhushan <[email protected]>
---
arch/arm64/include/asm/cputype.h | 13 +++++++
arch/arm64/kernel/cpu_errata.c | 24 +++++++++++++
arch/arm64/tools/cpucaps | 1 +
drivers/clocksource/Kconfig | 13 +++++++
drivers/clocksource/arm_arch_timer.c | 54 ++++++++++++++++++++++++++++
5 files changed, 105 insertions(+)

diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 6231e1f0abe7..e9a76935ee0f 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -81,6 +81,13 @@
#define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
#define CAVIUM_CPU_PART_THUNDERX2 0x0AF

+#define MRVL_CPU_PART_OCTEONTX2_98XX 0x0B1
+#define MRVL_CPU_PART_OCTEONTX2_96XX 0x0B2
+#define MRVL_CPU_PART_OCTEONTX2_95XX 0x0B3
+#define MRVL_CPU_PART_OCTEONTX2_LOKI 0x0B4
+#define MRVL_CPU_PART_OCTEONTX2_95MM 0x0B5
+#define MRVL_CPU_PART_OCTEONTX2_95O 0x0B6
+
#define BRCM_CPU_PART_BRAHMA_B53 0x100
#define BRCM_CPU_PART_VULCAN 0x516

@@ -117,6 +124,12 @@
#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
#define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2)
+#define MIDR_MRVL_OCTEONTX2_98XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_98XX)
+#define MIDR_MRVL_OCTEONTX2_96XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_96XX)
+#define MIDR_MRVL_OCTEONTX2_95XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_95XX)
+#define MIDR_MRVL_OCTEONTX2_LOKI MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_LOKI)
+#define MIDR_MRVL_OCTEONTX2_95MM MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_95MM)
+#define MIDR_MRVL_OCTEONTX2_95O MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_95O)
#define MIDR_BRAHMA_B53 MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_BRAHMA_B53)
#define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN)
#define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index e2c20c036442..363f83adb333 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -339,6 +339,23 @@ static const struct midr_range erratum_1463225[] = {
{},
};
#endif
+#ifdef CONFIG_MARVELL_ERRATUM_38627
+static const struct midr_range marvell_erratum_38627_cpus[] = {
+ /* Marvell OcteonTX 2, 95xx all passes */
+ MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95XX),
+ /* Marvell OcteonTX 2, 95MM all passes */
+ MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95MM),
+ /* Marvell OcteonTX 2, LOKI all passes */
+ MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_LOKI),
+ /* Marvell OcteonTX 2, 96xx all passes */
+ MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_96XX),
+ /* Marvell OcteonTX 2, 98xx pass 1.0 */
+ MIDR_REV(MIDR_MRVL_OCTEONTX2_98XX, 0, 0),
+ /* Marvell OcteonTX 2, 95O pass 1.0 */
+ MIDR_REV(MIDR_MRVL_OCTEONTX2_95O, 0, 0),
+ {},
+};
+#endif

const struct arm64_cpu_capabilities arm64_errata[] = {
#ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
@@ -439,6 +456,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
.capability = ARM64_WORKAROUND_858921,
ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
},
+#endif
+#ifdef CONFIG_MARVELL_ERRATUM_38627
+ {
+ .desc = "MARVELL erratum 38627",
+ .capability = ARM64_WORKAROUND_MRVL_38627,
+ ERRATA_MIDR_RANGE_LIST(marvell_erratum_38627_cpus),
+ },
#endif
{
.desc = "Spectre-v2",
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 49305c2e6dfd..99dbe80445ef 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -60,6 +60,7 @@ WORKAROUND_CAVIUM_TX2_219_PRFM
WORKAROUND_CAVIUM_TX2_219_TVM
WORKAROUND_CLEAN_CACHE
WORKAROUND_DEVICE_LOAD_ACQUIRE
+WORKAROUND_MRVL_38627
WORKAROUND_NVIDIA_CARMEL_CNP
WORKAROUND_QCOM_FALKOR_E1003
WORKAROUND_REPEAT_TLBI
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index eb661b539a3e..83feb502cf02 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -309,6 +309,19 @@ config ARM_ARCH_TIMER_EVTSTREAM
config ARM_ARCH_TIMER_OOL_WORKAROUND
bool

+config MARVELL_ERRATUM_38627
+ bool "Workaround for Marvell Erratum 38627"
+ default y
+ depends on ARM_ARCH_TIMER && ARM64
+ select ARM_ARCH_TIMER_OOL_WORKAROUND
+ help
+ This option enables a workaround for Marvell Erratum
+ 38627. According to this errata CPU pipeline have
+ unpredicted behavior when timer interrupt appears and
+ then disappears prior to the exception happening.
+ This Errata workaround is applicable only to some Marvell
+ OcteonTX2 series of processors.
+
config FSL_ERRATUM_A008585
bool "Workaround for Freescale/NXP Erratum A-008585"
default y
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
index be6d741d404c..d1c5e2aa2e7c 100644
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -27,6 +27,7 @@
#include <linux/acpi.h>
#include <linux/arm-smccc.h>
#include <linux/ptp_kvm.h>
+#include <linux/delay.h>

#include <asm/arch_timer.h>
#include <asm/virt.h>
@@ -88,6 +89,8 @@ static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_NONE;

static cpumask_t evtstrm_available = CPU_MASK_NONE;
static bool evtstrm_enable __ro_after_init = IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM);
+static __always_inline void set_next_event(const int access, unsigned long evt,
+ struct clock_event_device *clk);

static int __init early_evtstrm_cfg(char *buf)
{
@@ -432,6 +435,48 @@ static __maybe_unused int erratum_set_next_event_tval_phys(unsigned long evt,
return 0;
}

+#ifdef CONFIG_MARVELL_ERRATUM_38627
+/* Workaround is to ensure maximum 2us of time gap between timer expiry
+ * and timer programming which can de-assert timer interrupt.
+ * Time calculation below is based on 100MHz as timer frequency is fixed
+ * to 100MHz on all affected parts.
+ */
+static __always_inline
+void erratum_38627_set_next_event(const int access, unsigned long evt,
+ struct clock_event_device *clk)
+{
+ int32_t tval;
+
+ tval = arch_timer_reg_read(access, ARCH_TIMER_REG_TVAL, clk);
+
+ /* Timer already expired, wait for (2 - expired time)us */
+ if ((tval > -200) && (tval < 0))
+ udelay(2 + tval/100);
+
+ /* Timer is about to expire, wait for 2us + time to expire */
+ if (tval >= 0 && tval < 200)
+ udelay(3 + tval/100);
+
+ set_next_event(access, evt, clk);
+}
+
+static __maybe_unused
+int erratum_38627_set_next_event_tval_virt(unsigned long evt,
+ struct clock_event_device *clk)
+{
+ erratum_38627_set_next_event(ARCH_TIMER_VIRT_ACCESS, evt, clk);
+ return 0;
+}
+
+static __maybe_unused
+int erratum_38627_set_next_event_tval_phys(unsigned long evt,
+ struct clock_event_device *clk)
+{
+ erratum_38627_set_next_event(ARCH_TIMER_PHYS_ACCESS, evt, clk);
+ return 0;
+}
+#endif
+
static const struct arch_timer_erratum_workaround ool_workarounds[] = {
#ifdef CONFIG_FSL_ERRATUM_A008585
{
@@ -479,6 +524,15 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = {
.read_cntvct_el0 = arm64_858921_read_cntvct_el0,
},
#endif
+#ifdef CONFIG_MARVELL_ERRATUM_38627
+ {
+ .match_type = ate_match_local_cap_id,
+ .id = (void *)ARM64_WORKAROUND_MRVL_38627,
+ .desc = "Marvell erratum 38627",
+ .set_next_event_phys = erratum_38627_set_next_event_tval_phys,
+ .set_next_event_virt = erratum_38627_set_next_event_tval_virt,
+ },
+#endif
#ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1
{
.match_type = ate_match_dt,
--
2.17.1


2021-07-05 09:10:19

by Mark Rutland

[permalink] [raw]
Subject: Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

Hi Bharat,

On Mon, Jul 05, 2021 at 11:38:43AM +0530, Bharat Bhushan wrote:
> CPU pipeline have unpredicted behavior when timer
> interrupt appears and then disappears prior to the
> exception happening. Time interrupt appears on timer
> expiry and disappears when timer programming or timer
> disable. This typically can happen when a load
> instruction misses in the cache, which can take
> few hundreds of cycles, and an interrupt appears
> after the load instruction starts executing but
> disappears before the load instruction completes.

Could you elaborate on the scenario? What sort of unpredictable
behaviour can occur? e.g:

* Does the CPU lockup?
* Does the CPU take the exception at all?
* Does the load behave erroneously?
* Does any CPU state (e.g. GPRs, PC, PSTATE) become corrupted?

Does the problem manifest when IRQs are masked by DAIF.I, or by
CNT8_CTL_EL0.{IMASK,ENABLE} ?

> Workaround of this is to ensure maximum 2us of time
> gap between timer interrupt and timer programming
> which can de-assert timer interrupt.

The code below seems to try to enforce a 2us *minimum*. Which is it
supposed to be?

Can you explain *why* this is supposed to help?

I don't see how we can guarantee this in a VM, or if the CPU misses on
an instruction fetch.

> Signed-off-by: Linu Cherian <[email protected]>
> Signed-off-by: Bharat Bhushan <[email protected]>
> ---
> arch/arm64/include/asm/cputype.h | 13 +++++++
> arch/arm64/kernel/cpu_errata.c | 24 +++++++++++++
> arch/arm64/tools/cpucaps | 1 +
> drivers/clocksource/Kconfig | 13 +++++++
> drivers/clocksource/arm_arch_timer.c | 54 ++++++++++++++++++++++++++++
> 5 files changed, 105 insertions(+)
>
> diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
> index 6231e1f0abe7..e9a76935ee0f 100644
> --- a/arch/arm64/include/asm/cputype.h
> +++ b/arch/arm64/include/asm/cputype.h
> @@ -81,6 +81,13 @@
> #define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
> #define CAVIUM_CPU_PART_THUNDERX2 0x0AF
>
> +#define MRVL_CPU_PART_OCTEONTX2_98XX 0x0B1
> +#define MRVL_CPU_PART_OCTEONTX2_96XX 0x0B2
> +#define MRVL_CPU_PART_OCTEONTX2_95XX 0x0B3
> +#define MRVL_CPU_PART_OCTEONTX2_LOKI 0x0B4
> +#define MRVL_CPU_PART_OCTEONTX2_95MM 0x0B5
> +#define MRVL_CPU_PART_OCTEONTX2_95O 0x0B6
> +
> #define BRCM_CPU_PART_BRAHMA_B53 0x100
> #define BRCM_CPU_PART_VULCAN 0x516
>
> @@ -117,6 +124,12 @@
> #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
> #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
> #define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2)
> +#define MIDR_MRVL_OCTEONTX2_98XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_98XX)
> +#define MIDR_MRVL_OCTEONTX2_96XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_96XX)
> +#define MIDR_MRVL_OCTEONTX2_95XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_95XX)
> +#define MIDR_MRVL_OCTEONTX2_LOKI MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_LOKI)
> +#define MIDR_MRVL_OCTEONTX2_95MM MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_95MM)
> +#define MIDR_MRVL_OCTEONTX2_95O MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_95O)
> #define MIDR_BRAHMA_B53 MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_BRAHMA_B53)
> #define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN)
> #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)
> diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
> index e2c20c036442..363f83adb333 100644
> --- a/arch/arm64/kernel/cpu_errata.c
> +++ b/arch/arm64/kernel/cpu_errata.c
> @@ -339,6 +339,23 @@ static const struct midr_range erratum_1463225[] = {
> {},
> };
> #endif
> +#ifdef CONFIG_MARVELL_ERRATUM_38627
> +static const struct midr_range marvell_erratum_38627_cpus[] = {
> + /* Marvell OcteonTX 2, 95xx all passes */
> + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95XX),
> + /* Marvell OcteonTX 2, 95MM all passes */
> + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95MM),
> + /* Marvell OcteonTX 2, LOKI all passes */
> + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_LOKI),
> + /* Marvell OcteonTX 2, 96xx all passes */
> + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_96XX),
> + /* Marvell OcteonTX 2, 98xx pass 1.0 */
> + MIDR_REV(MIDR_MRVL_OCTEONTX2_98XX, 0, 0),
> + /* Marvell OcteonTX 2, 95O pass 1.0 */
> + MIDR_REV(MIDR_MRVL_OCTEONTX2_95O, 0, 0),
> + {},
> +};
> +#endif
>
> const struct arm64_cpu_capabilities arm64_errata[] = {
> #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
> @@ -439,6 +456,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
> .capability = ARM64_WORKAROUND_858921,
> ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
> },
> +#endif
> +#ifdef CONFIG_MARVELL_ERRATUM_38627
> + {
> + .desc = "MARVELL erratum 38627",
> + .capability = ARM64_WORKAROUND_MRVL_38627,
> + ERRATA_MIDR_RANGE_LIST(marvell_erratum_38627_cpus),
> + },
> #endif
> {
> .desc = "Spectre-v2",
> diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
> index 49305c2e6dfd..99dbe80445ef 100644
> --- a/arch/arm64/tools/cpucaps
> +++ b/arch/arm64/tools/cpucaps
> @@ -60,6 +60,7 @@ WORKAROUND_CAVIUM_TX2_219_PRFM
> WORKAROUND_CAVIUM_TX2_219_TVM
> WORKAROUND_CLEAN_CACHE
> WORKAROUND_DEVICE_LOAD_ACQUIRE
> +WORKAROUND_MRVL_38627
> WORKAROUND_NVIDIA_CARMEL_CNP
> WORKAROUND_QCOM_FALKOR_E1003
> WORKAROUND_REPEAT_TLBI
> diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
> index eb661b539a3e..83feb502cf02 100644
> --- a/drivers/clocksource/Kconfig
> +++ b/drivers/clocksource/Kconfig
> @@ -309,6 +309,19 @@ config ARM_ARCH_TIMER_EVTSTREAM
> config ARM_ARCH_TIMER_OOL_WORKAROUND
> bool
>
> +config MARVELL_ERRATUM_38627
> + bool "Workaround for Marvell Erratum 38627"
> + default y
> + depends on ARM_ARCH_TIMER && ARM64
> + select ARM_ARCH_TIMER_OOL_WORKAROUND
> + help
> + This option enables a workaround for Marvell Erratum
> + 38627. According to this errata CPU pipeline have
> + unpredicted behavior when timer interrupt appears and
> + then disappears prior to the exception happening.
> + This Errata workaround is applicable only to some Marvell
> + OcteonTX2 series of processors.
> +
> config FSL_ERRATUM_A008585
> bool "Workaround for Freescale/NXP Erratum A-008585"
> default y
> diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
> index be6d741d404c..d1c5e2aa2e7c 100644
> --- a/drivers/clocksource/arm_arch_timer.c
> +++ b/drivers/clocksource/arm_arch_timer.c
> @@ -27,6 +27,7 @@
> #include <linux/acpi.h>
> #include <linux/arm-smccc.h>
> #include <linux/ptp_kvm.h>
> +#include <linux/delay.h>
>
> #include <asm/arch_timer.h>
> #include <asm/virt.h>
> @@ -88,6 +89,8 @@ static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_NONE;
>
> static cpumask_t evtstrm_available = CPU_MASK_NONE;
> static bool evtstrm_enable __ro_after_init = IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM);
> +static __always_inline void set_next_event(const int access, unsigned long evt,
> + struct clock_event_device *clk);
>
> static int __init early_evtstrm_cfg(char *buf)
> {
> @@ -432,6 +435,48 @@ static __maybe_unused int erratum_set_next_event_tval_phys(unsigned long evt,
> return 0;
> }
>
> +#ifdef CONFIG_MARVELL_ERRATUM_38627
> +/* Workaround is to ensure maximum 2us of time gap between timer expiry
> + * and timer programming which can de-assert timer interrupt.
> + * Time calculation below is based on 100MHz as timer frequency is fixed
> + * to 100MHz on all affected parts.
> + */
> +static __always_inline
> +void erratum_38627_set_next_event(const int access, unsigned long evt,
> + struct clock_event_device *clk)
> +{
> + int32_t tval;
> +
> + tval = arch_timer_reg_read(access, ARCH_TIMER_REG_TVAL, clk);
> +
> + /* Timer already expired, wait for (2 - expired time)us */
> + if ((tval > -200) && (tval < 0))
> + udelay(2 + tval/100);

Isn't this ensuring a 2us *minimum* rather than *maximum* ?

> +
> + /* Timer is about to expire, wait for 2us + time to expire */
> + if (tval >= 0 && tval < 200)
> + udelay(3 + tval/100);

Again, this appears to be waiting for 2us *minimum*.

What if when we did the read we have 2.01 us to go? Is the expectation
that we'll manage to execute this code and set the next event in less
that 2us?

Thanks,
Mark.

> +
> + set_next_event(access, evt, clk);
> +}
> +
> +static __maybe_unused
> +int erratum_38627_set_next_event_tval_virt(unsigned long evt,
> + struct clock_event_device *clk)
> +{
> + erratum_38627_set_next_event(ARCH_TIMER_VIRT_ACCESS, evt, clk);
> + return 0;
> +}
> +
> +static __maybe_unused
> +int erratum_38627_set_next_event_tval_phys(unsigned long evt,
> + struct clock_event_device *clk)
> +{
> + erratum_38627_set_next_event(ARCH_TIMER_PHYS_ACCESS, evt, clk);
> + return 0;
> +}
> +#endif
> +
> static const struct arch_timer_erratum_workaround ool_workarounds[] = {
> #ifdef CONFIG_FSL_ERRATUM_A008585
> {
> @@ -479,6 +524,15 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = {
> .read_cntvct_el0 = arm64_858921_read_cntvct_el0,
> },
> #endif
> +#ifdef CONFIG_MARVELL_ERRATUM_38627
> + {
> + .match_type = ate_match_local_cap_id,
> + .id = (void *)ARM64_WORKAROUND_MRVL_38627,
> + .desc = "Marvell erratum 38627",
> + .set_next_event_phys = erratum_38627_set_next_event_tval_phys,
> + .set_next_event_virt = erratum_38627_set_next_event_tval_virt,
> + },
> +#endif
> #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1
> {
> .match_type = ate_match_dt,
> --
> 2.17.1
>

2021-07-05 09:17:53

by Mark Rutland

[permalink] [raw]
Subject: Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

On Mon, Jul 05, 2021 at 10:07:53AM +0100, Mark Rutland wrote:
> Hi Bharat,
>
> On Mon, Jul 05, 2021 at 11:38:43AM +0530, Bharat Bhushan wrote:
> > CPU pipeline have unpredicted behavior when timer
> > interrupt appears and then disappears prior to the
> > exception happening. Time interrupt appears on timer
> > expiry and disappears when timer programming or timer
> > disable. This typically can happen when a load
> > instruction misses in the cache, which can take
> > few hundreds of cycles, and an interrupt appears
> > after the load instruction starts executing but
> > disappears before the load instruction completes.
>
> Could you elaborate on the scenario? What sort of unpredictable
> behaviour can occur? e.g:
>
> * Does the CPU lockup?
> * Does the CPU take the exception at all?
> * Does the load behave erroneously?
> * Does any CPU state (e.g. GPRs, PC, PSTATE) become corrupted?
>
> Does the problem manifest when IRQs are masked by DAIF.I, or by
> CNT8_CTL_EL0.{IMASK,ENABLE} ?

Whoops, that was supposed to say:

| CNT*_CTL_EL0.{IMASK,ENABLE}

... i.e. those fields in either CNTP_CTL_EL0 or CNTV_CTL_EL0.

Thanks,
Mark.

2021-07-05 09:30:33

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

On Mon, 05 Jul 2021 07:08:43 +0100,
Bharat Bhushan <[email protected]> wrote:
>
> CPU pipeline have unpredicted behavior when timer
> interrupt appears and then disappears prior to the
> exception happening.

What kind of unpredictable behaviours? What happens if a guest isn't
aware of the erratum or actively tries to trigger it?

> Time interrupt appears on timer
> expiry and disappears when timer programming or timer
> disable. This typically can happen when a load
> instruction misses in the cache, which can take
> few hundreds of cycles, and an interrupt appears
> after the load instruction starts executing but
> disappears before the load instruction completes.
>
> Workaround of this is to ensure maximum 2us of time

maximum? I'm not sure how you can bound this. Or did you mean
*minimum*?

How was this value obtained? What guarantees that it is safe?

> gap between timer interrupt and timer programming
> which can de-assert timer interrupt.

What guarantees do you have on the propagation of the interrupt signal
from the timer to the CPU, given that the GIC is involved in the
middle of it?

>
> Signed-off-by: Linu Cherian <[email protected]>
> Signed-off-by: Bharat Bhushan <[email protected]>
> ---
> arch/arm64/include/asm/cputype.h | 13 +++++++
> arch/arm64/kernel/cpu_errata.c | 24 +++++++++++++
> arch/arm64/tools/cpucaps | 1 +
> drivers/clocksource/Kconfig | 13 +++++++
> drivers/clocksource/arm_arch_timer.c | 54 ++++++++++++++++++++++++++++
> 5 files changed, 105 insertions(+)
>
> diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
> index 6231e1f0abe7..e9a76935ee0f 100644
> --- a/arch/arm64/include/asm/cputype.h
> +++ b/arch/arm64/include/asm/cputype.h
> @@ -81,6 +81,13 @@
> #define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
> #define CAVIUM_CPU_PART_THUNDERX2 0x0AF
>
> +#define MRVL_CPU_PART_OCTEONTX2_98XX 0x0B1
> +#define MRVL_CPU_PART_OCTEONTX2_96XX 0x0B2
> +#define MRVL_CPU_PART_OCTEONTX2_95XX 0x0B3
> +#define MRVL_CPU_PART_OCTEONTX2_LOKI 0x0B4
> +#define MRVL_CPU_PART_OCTEONTX2_95MM 0x0B5
> +#define MRVL_CPU_PART_OCTEONTX2_95O 0x0B6
> +
> #define BRCM_CPU_PART_BRAHMA_B53 0x100
> #define BRCM_CPU_PART_VULCAN 0x516
>
> @@ -117,6 +124,12 @@
> #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
> #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
> #define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2)
> +#define MIDR_MRVL_OCTEONTX2_98XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_98XX)
> +#define MIDR_MRVL_OCTEONTX2_96XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_96XX)
> +#define MIDR_MRVL_OCTEONTX2_95XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_95XX)
> +#define MIDR_MRVL_OCTEONTX2_LOKI MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_LOKI)
> +#define MIDR_MRVL_OCTEONTX2_95MM MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_95MM)
> +#define MIDR_MRVL_OCTEONTX2_95O MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, MRVL_CPU_PART_OCTEONTX2_95O)
> #define MIDR_BRAHMA_B53 MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_BRAHMA_B53)
> #define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN)
> #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)

This part should be is a separate patch.

> diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
> index e2c20c036442..363f83adb333 100644
> --- a/arch/arm64/kernel/cpu_errata.c
> +++ b/arch/arm64/kernel/cpu_errata.c
> @@ -339,6 +339,23 @@ static const struct midr_range erratum_1463225[] = {
> {},
> };
> #endif
> +#ifdef CONFIG_MARVELL_ERRATUM_38627
> +static const struct midr_range marvell_erratum_38627_cpus[] = {
> + /* Marvell OcteonTX 2, 95xx all passes */
> + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95XX),
> + /* Marvell OcteonTX 2, 95MM all passes */
> + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95MM),
> + /* Marvell OcteonTX 2, LOKI all passes */
> + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_LOKI),
> + /* Marvell OcteonTX 2, 96xx all passes */
> + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_96XX),
> + /* Marvell OcteonTX 2, 98xx pass 1.0 */
> + MIDR_REV(MIDR_MRVL_OCTEONTX2_98XX, 0, 0),
> + /* Marvell OcteonTX 2, 95O pass 1.0 */
> + MIDR_REV(MIDR_MRVL_OCTEONTX2_95O, 0, 0),

Is there any part that is *not* affected?

> + {},
> +};
> +#endif
>
> const struct arm64_cpu_capabilities arm64_errata[] = {
> #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
> @@ -439,6 +456,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
> .capability = ARM64_WORKAROUND_858921,
> ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
> },
> +#endif
> +#ifdef CONFIG_MARVELL_ERRATUM_38627
> + {
> + .desc = "MARVELL erratum 38627",
> + .capability = ARM64_WORKAROUND_MRVL_38627,
> + ERRATA_MIDR_RANGE_LIST(marvell_erratum_38627_cpus),
> + },
> #endif
> {
> .desc = "Spectre-v2",
> diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
> index 49305c2e6dfd..99dbe80445ef 100644
> --- a/arch/arm64/tools/cpucaps
> +++ b/arch/arm64/tools/cpucaps
> @@ -60,6 +60,7 @@ WORKAROUND_CAVIUM_TX2_219_PRFM
> WORKAROUND_CAVIUM_TX2_219_TVM
> WORKAROUND_CLEAN_CACHE
> WORKAROUND_DEVICE_LOAD_ACQUIRE
> +WORKAROUND_MRVL_38627
> WORKAROUND_NVIDIA_CARMEL_CNP
> WORKAROUND_QCOM_FALKOR_E1003
> WORKAROUND_REPEAT_TLBI
> diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
> index eb661b539a3e..83feb502cf02 100644
> --- a/drivers/clocksource/Kconfig
> +++ b/drivers/clocksource/Kconfig
> @@ -309,6 +309,19 @@ config ARM_ARCH_TIMER_EVTSTREAM
> config ARM_ARCH_TIMER_OOL_WORKAROUND
> bool
>
> +config MARVELL_ERRATUM_38627
> + bool "Workaround for Marvell Erratum 38627"
> + default y
> + depends on ARM_ARCH_TIMER && ARM64
> + select ARM_ARCH_TIMER_OOL_WORKAROUND
> + help
> + This option enables a workaround for Marvell Erratum
> + 38627. According to this errata CPU pipeline have
> + unpredicted behavior when timer interrupt appears and
> + then disappears prior to the exception happening.

This wording is actually a lot clearer than the commit message. What
happens if a guest plays with the timer?

> + This Errata workaround is applicable only to some Marvell
> + OcteonTX2 series of processors.
> +
> config FSL_ERRATUM_A008585
> bool "Workaround for Freescale/NXP Erratum A-008585"
> default y
> diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c
> index be6d741d404c..d1c5e2aa2e7c 100644
> --- a/drivers/clocksource/arm_arch_timer.c
> +++ b/drivers/clocksource/arm_arch_timer.c
> @@ -27,6 +27,7 @@
> #include <linux/acpi.h>
> #include <linux/arm-smccc.h>
> #include <linux/ptp_kvm.h>
> +#include <linux/delay.h>
>
> #include <asm/arch_timer.h>
> #include <asm/virt.h>
> @@ -88,6 +89,8 @@ static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_NONE;
>
> static cpumask_t evtstrm_available = CPU_MASK_NONE;
> static bool evtstrm_enable __ro_after_init = IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM);
> +static __always_inline void set_next_event(const int access, unsigned long evt,
> + struct clock_event_device *clk);
>
> static int __init early_evtstrm_cfg(char *buf)
> {
> @@ -432,6 +435,48 @@ static __maybe_unused int erratum_set_next_event_tval_phys(unsigned long evt,
> return 0;
> }
>
> +#ifdef CONFIG_MARVELL_ERRATUM_38627
> +/* Workaround is to ensure maximum 2us of time gap between timer expiry
> + * and timer programming which can de-assert timer interrupt.
> + * Time calculation below is based on 100MHz as timer frequency is fixed
> + * to 100MHz on all affected parts.
> + */

Please fix the comment style.

> +static __always_inline

Please drop this __always_inline. There is no hard requirement for it,
and you are about to *wait*, so waiting faster is... not that
interesting. The compiler will figure it out.

> +void erratum_38627_set_next_event(const int access, unsigned long evt,
> + struct clock_event_device *clk)
> +{
> + int32_t tval;
> +
> + tval = arch_timer_reg_read(access, ARCH_TIMER_REG_TVAL, clk);
> +
> + /* Timer already expired, wait for (2 - expired time)us */
> + if ((tval > -200) && (tval < 0))
> + udelay(2 + tval/100);
> +
> + /* Timer is about to expire, wait for 2us + time to expire */
> + if (tval >= 0 && tval < 200)
> + udelay(3 + tval/100);

I read 3us here, which contradicts the comment. Also, even if you wait
for an arbitrary amount of time, I still don't see what guarantees you
may have that the interrupt will have actually be signalled.

> +
> + set_next_event(access, evt, clk);
> +}
> +
> +static __maybe_unused
> +int erratum_38627_set_next_event_tval_virt(unsigned long evt,
> + struct clock_event_device *clk)
> +{
> + erratum_38627_set_next_event(ARCH_TIMER_VIRT_ACCESS, evt, clk);
> + return 0;
> +}
> +
> +static __maybe_unused
> +int erratum_38627_set_next_event_tval_phys(unsigned long evt,
> + struct clock_event_device *clk)
> +{
> + erratum_38627_set_next_event(ARCH_TIMER_PHYS_ACCESS, evt, clk);
> + return 0;
> +}
> +#endif
> +
> static const struct arch_timer_erratum_workaround ool_workarounds[] = {
> #ifdef CONFIG_FSL_ERRATUM_A008585
> {
> @@ -479,6 +524,15 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = {
> .read_cntvct_el0 = arm64_858921_read_cntvct_el0,
> },
> #endif
> +#ifdef CONFIG_MARVELL_ERRATUM_38627
> + {
> + .match_type = ate_match_local_cap_id,
> + .id = (void *)ARM64_WORKAROUND_MRVL_38627,
> + .desc = "Marvell erratum 38627",
> + .set_next_event_phys = erratum_38627_set_next_event_tval_phys,
> + .set_next_event_virt = erratum_38627_set_next_event_tval_virt,
> + },
> +#endif
> #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1
> {
> .match_type = ate_match_dt,

You need to update Documentation/arm64/silicon-errata.rst.

Overall, this patch raises *a lot* of questions...

M.

--
Without deviation from the norm, progress is not possible.

2021-07-08 10:49:49

by Bharat Bhushan

[permalink] [raw]
Subject: RE: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

Hi Mark,

Sorry for the delay, was gathering some details.
Pease see inline

> -----Original Message-----
> From: Mark Rutland <[email protected]>
> Sent: Monday, July 5, 2021 2:38 PM
> To: Bharat Bhushan <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; linux-arm-
> [email protected]; [email protected]; Linu Cherian
> <[email protected]>
> Subject: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround
>
> External Email
>
> ----------------------------------------------------------------------
> Hi Bharat,
>
> On Mon, Jul 05, 2021 at 11:38:43AM +0530, Bharat Bhushan wrote:
> > CPU pipeline have unpredicted behavior when timer interrupt appears
> > and then disappears prior to the exception happening. Time interrupt
> > appears on timer expiry and disappears when timer programming or timer
> > disable. This typically can happen when a load instruction misses in
> > the cache, which can take few hundreds of cycles, and an interrupt
> > appears after the load instruction starts executing but disappears
> > before the load instruction completes.
>
> Could you elaborate on the scenario? What sort of unpredictable behaviour can
> occur? e.g:

This is a race condition where an instruction (except store, system, load atomic and load exclusive) becomes "nop" if interrupt appears and disappears before taken by CPU. For example interrupt appears after the atomic load instruction starts executing and disappears before the atomic load instruction completes, in that case instruction (not all) can become "nop". As interrupt disappears before atomic instruction completes, cpu continues to execute and while take junk from register as other dependent got "nop".
>
> * Does the CPU lockup?
No

> * Does the CPU take the exception at all?
No

> * Does the load behave erroneously?
No,

> * Does any CPU state (e.g. GPRs, PC, PSTATE) become corrupted?

yes, GPRs will get corrupted, will have stale value

>
> Does the problem manifest when IRQs are masked by DAIF.I, or by
> CNT*_CTL_EL0.{IMASK,ENABLE} ?

No, there are no issue if interrupts are masked.

>
> > Workaround of this is to ensure maximum 2us of time gap between timer
> > interrupt and timer programming which can de-assert timer interrupt.
>
> The code below seems to try to enforce a 2us *minimum*. Which is it supposed
> to be?

Yes, it is minimum 2us.

>
> Can you explain *why* this is supposed to help?
With the workaround interrupt assertion and de-assertion will be minimum 2us apart.

>
> I don't see how we can guarantee this in a VM, or if the CPU misses on an
> instruction fetch.

This errata applies to VM (virtual timer) as well, maybe there is some gap in my understanding, how it will be different in VM.
Can you help with what issue we can have VM?

>
> > Signed-off-by: Linu Cherian <[email protected]>
> > Signed-off-by: Bharat Bhushan <[email protected]>
> > ---
> > arch/arm64/include/asm/cputype.h | 13 +++++++
> > arch/arm64/kernel/cpu_errata.c | 24 +++++++++++++
> > arch/arm64/tools/cpucaps | 1 +
> > drivers/clocksource/Kconfig | 13 +++++++
> > drivers/clocksource/arm_arch_timer.c | 54
> > ++++++++++++++++++++++++++++
> > 5 files changed, 105 insertions(+)
> >
> > diff --git a/arch/arm64/include/asm/cputype.h
> > b/arch/arm64/include/asm/cputype.h
> > index 6231e1f0abe7..e9a76935ee0f 100644
> > --- a/arch/arm64/include/asm/cputype.h
> > +++ b/arch/arm64/include/asm/cputype.h
> > @@ -81,6 +81,13 @@
> > #define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
> > #define CAVIUM_CPU_PART_THUNDERX2 0x0AF
> >
> > +#define MRVL_CPU_PART_OCTEONTX2_98XX 0x0B1
> > +#define MRVL_CPU_PART_OCTEONTX2_96XX 0x0B2
> > +#define MRVL_CPU_PART_OCTEONTX2_95XX 0x0B3
> > +#define MRVL_CPU_PART_OCTEONTX2_LOKI 0x0B4
> > +#define MRVL_CPU_PART_OCTEONTX2_95MM 0x0B5
> > +#define MRVL_CPU_PART_OCTEONTX2_95O 0x0B6
> > +
> > #define BRCM_CPU_PART_BRAHMA_B53 0x100
> > #define BRCM_CPU_PART_VULCAN 0x516
> >
> > @@ -117,6 +124,12 @@
> > #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX
> > MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> CAVIUM_CPU_PART_THUNDERX_83XX)
> > #define MIDR_CAVIUM_THUNDERX2
> MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > CAVIUM_CPU_PART_THUNDERX2)
> > +#define MIDR_MRVL_OCTEONTX2_98XX
> MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > +MRVL_CPU_PART_OCTEONTX2_98XX) #define MIDR_MRVL_OCTEONTX2_96XX
> > +MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> MRVL_CPU_PART_OCTEONTX2_96XX)
> > +#define MIDR_MRVL_OCTEONTX2_95XX
> MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > +MRVL_CPU_PART_OCTEONTX2_95XX) #define MIDR_MRVL_OCTEONTX2_LOKI
> > +MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> MRVL_CPU_PART_OCTEONTX2_LOKI)
> > +#define MIDR_MRVL_OCTEONTX2_95MM
> MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > +MRVL_CPU_PART_OCTEONTX2_95MM) #define
> MIDR_MRVL_OCTEONTX2_95O
> > +MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> MRVL_CPU_PART_OCTEONTX2_95O)
> > #define MIDR_BRAHMA_B53 MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM,
> > BRCM_CPU_PART_BRAHMA_B53) #define MIDR_BRCM_VULCAN
> > MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN)
> #define
> > MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM,
> > QCOM_CPU_PART_FALKOR_V1) diff --git a/arch/arm64/kernel/cpu_errata.c
> > b/arch/arm64/kernel/cpu_errata.c index e2c20c036442..363f83adb333
> > 100644
> > --- a/arch/arm64/kernel/cpu_errata.c
> > +++ b/arch/arm64/kernel/cpu_errata.c
> > @@ -339,6 +339,23 @@ static const struct midr_range erratum_1463225[] = {
> > {},
> > };
> > #endif
> > +#ifdef CONFIG_MARVELL_ERRATUM_38627
> > +static const struct midr_range marvell_erratum_38627_cpus[] = {
> > + /* Marvell OcteonTX 2, 95xx all passes */
> > + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95XX),
> > + /* Marvell OcteonTX 2, 95MM all passes */
> > + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95MM),
> > + /* Marvell OcteonTX 2, LOKI all passes */
> > + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_LOKI),
> > + /* Marvell OcteonTX 2, 96xx all passes */
> > + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_96XX),
> > + /* Marvell OcteonTX 2, 98xx pass 1.0 */
> > + MIDR_REV(MIDR_MRVL_OCTEONTX2_98XX, 0, 0),
> > + /* Marvell OcteonTX 2, 95O pass 1.0 */
> > + MIDR_REV(MIDR_MRVL_OCTEONTX2_95O, 0, 0),
> > + {},
> > +};
> > +#endif
> >
> > const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef
> > CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
> > @@ -439,6 +456,13 @@ const struct arm64_cpu_capabilities arm64_errata[] =
> {
> > .capability = ARM64_WORKAROUND_858921,
> > ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
> > },
> > +#endif
> > +#ifdef CONFIG_MARVELL_ERRATUM_38627
> > + {
> > + .desc = "MARVELL erratum 38627",
> > + .capability = ARM64_WORKAROUND_MRVL_38627,
> > + ERRATA_MIDR_RANGE_LIST(marvell_erratum_38627_cpus),
> > + },
> > #endif
> > {
> > .desc = "Spectre-v2",
> > diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index
> > 49305c2e6dfd..99dbe80445ef 100644
> > --- a/arch/arm64/tools/cpucaps
> > +++ b/arch/arm64/tools/cpucaps
> > @@ -60,6 +60,7 @@ WORKAROUND_CAVIUM_TX2_219_PRFM
> > WORKAROUND_CAVIUM_TX2_219_TVM WORKAROUND_CLEAN_CACHE
> > WORKAROUND_DEVICE_LOAD_ACQUIRE
> > +WORKAROUND_MRVL_38627
> > WORKAROUND_NVIDIA_CARMEL_CNP
> > WORKAROUND_QCOM_FALKOR_E1003
> > WORKAROUND_REPEAT_TLBI
> > diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
> > index eb661b539a3e..83feb502cf02 100644
> > --- a/drivers/clocksource/Kconfig
> > +++ b/drivers/clocksource/Kconfig
> > @@ -309,6 +309,19 @@ config ARM_ARCH_TIMER_EVTSTREAM config
> > ARM_ARCH_TIMER_OOL_WORKAROUND
> > bool
> >
> > +config MARVELL_ERRATUM_38627
> > + bool "Workaround for Marvell Erratum 38627"
> > + default y
> > + depends on ARM_ARCH_TIMER && ARM64
> > + select ARM_ARCH_TIMER_OOL_WORKAROUND
> > + help
> > + This option enables a workaround for Marvell Erratum
> > + 38627. According to this errata CPU pipeline have
> > + unpredicted behavior when timer interrupt appears and
> > + then disappears prior to the exception happening.
> > + This Errata workaround is applicable only to some Marvell
> > + OcteonTX2 series of processors.
> > +
> > config FSL_ERRATUM_A008585
> > bool "Workaround for Freescale/NXP Erratum A-008585"
> > default y
> > diff --git a/drivers/clocksource/arm_arch_timer.c
> > b/drivers/clocksource/arm_arch_timer.c
> > index be6d741d404c..d1c5e2aa2e7c 100644
> > --- a/drivers/clocksource/arm_arch_timer.c
> > +++ b/drivers/clocksource/arm_arch_timer.c
> > @@ -27,6 +27,7 @@
> > #include <linux/acpi.h>
> > #include <linux/arm-smccc.h>
> > #include <linux/ptp_kvm.h>
> > +#include <linux/delay.h>
> >
> > #include <asm/arch_timer.h>
> > #include <asm/virt.h>
> > @@ -88,6 +89,8 @@ static enum vdso_clock_mode vdso_default =
> > VDSO_CLOCKMODE_NONE;
> >
> > static cpumask_t evtstrm_available = CPU_MASK_NONE; static bool
> > evtstrm_enable __ro_after_init =
> > IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM);
> > +static __always_inline void set_next_event(const int access, unsigned long evt,
> > + struct clock_event_device *clk);
> >
> > static int __init early_evtstrm_cfg(char *buf) { @@ -432,6 +435,48
> > @@ static __maybe_unused int erratum_set_next_event_tval_phys(unsigned
> long evt,
> > return 0;
> > }
> >
> > +#ifdef CONFIG_MARVELL_ERRATUM_38627
> > +/* Workaround is to ensure maximum 2us of time gap between timer
> > +expiry
> > + * and timer programming which can de-assert timer interrupt.
> > + * Time calculation below is based on 100MHz as timer frequency is
> > +fixed
> > + * to 100MHz on all affected parts.
> > + */
> > +static __always_inline
> > +void erratum_38627_set_next_event(const int access, unsigned long evt,
> > + struct clock_event_device *clk) {
> > + int32_t tval;
> > +
> > + tval = arch_timer_reg_read(access, ARCH_TIMER_REG_TVAL, clk);
> > +
> > + /* Timer already expired, wait for (2 - expired time)us */
> > + if ((tval > -200) && (tval < 0))
> > + udelay(2 + tval/100);
>
> Isn't this ensuring a 2us *minimum* rather than *maximum* ?

Minimum 2us, will correct
>
> > +
> > + /* Timer is about to expire, wait for 2us + time to expire */
> > + if (tval >= 0 && tval < 200)
> > + udelay(3 + tval/100);
>
> Again, this appears to be waiting for 2us *minimum*.

Yes, minimum 2us,

If "0 < tval < 100", in that case we want to wait 2us after tval expiry, so that turn out to be 2us + 1us (round up of 0 < tval < 100) = 3us.
For 99 < tval < 200, in that case we want to wait 2us after tval expiry, In that case that turn out to be 2us + 2us (round up of 99 < tval < 200) = 4us.

>
> What if when we did the read we have 2.01 us to go? Is the expectation that we'll
> manage to execute this code and set the next event in less that 2us?

2us is safe value and already considers the time between reading of tval and programming timer.
Expectation is that it will take few cycles in programming timer after reading tval.

Thanks
-Bharat

>
> Thanks,
> Mark.
>
> > +
> > + set_next_event(access, evt, clk);
> > +}
> > +
> > +static __maybe_unused
> > +int erratum_38627_set_next_event_tval_virt(unsigned long evt,
> > + struct clock_event_device *clk) {
> > + erratum_38627_set_next_event(ARCH_TIMER_VIRT_ACCESS, evt, clk);
> > + return 0;
> > +}
> > +
> > +static __maybe_unused
> > +int erratum_38627_set_next_event_tval_phys(unsigned long evt,
> > + struct clock_event_device *clk) {
> > + erratum_38627_set_next_event(ARCH_TIMER_PHYS_ACCESS, evt, clk);
> > + return 0;
> > +}
> > +#endif
> > +
> > static const struct arch_timer_erratum_workaround ool_workarounds[] =
> > { #ifdef CONFIG_FSL_ERRATUM_A008585
> > {
> > @@ -479,6 +524,15 @@ static const struct arch_timer_erratum_workaround
> ool_workarounds[] = {
> > .read_cntvct_el0 = arm64_858921_read_cntvct_el0,
> > },
> > #endif
> > +#ifdef CONFIG_MARVELL_ERRATUM_38627
> > + {
> > + .match_type = ate_match_local_cap_id,
> > + .id = (void *)ARM64_WORKAROUND_MRVL_38627,
> > + .desc = "Marvell erratum 38627",
> > + .set_next_event_phys =
> erratum_38627_set_next_event_tval_phys,
> > + .set_next_event_virt = erratum_38627_set_next_event_tval_virt,
> > + },
> > +#endif
> > #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1
> > {
> > .match_type = ate_match_dt,
> > --
> > 2.17.1
> >

2021-07-08 10:51:00

by Bharat Bhushan

[permalink] [raw]
Subject: RE: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

Hi Marc,

Similar questions are asked by Mark, response might be duplicated.

> -----Original Message-----
> From: Marc Zyngier <[email protected]>
> Sent: Monday, July 5, 2021 2:57 PM
> To: Bharat Bhushan <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; linux-arm-
> [email protected]; [email protected]; Linu Cherian
> <[email protected]>
> Subject: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround
>
> External Email
>
> ----------------------------------------------------------------------
> On Mon, 05 Jul 2021 07:08:43 +0100,
> Bharat Bhushan <[email protected]> wrote:
> >
> > CPU pipeline have unpredicted behavior when timer interrupt appears
> > and then disappears prior to the exception happening.
>
> What kind of unpredictable behaviours?

This is a race condition where an instruction (except store, system, load atomic and load exclusive) becomes "nop" if interrupt appears and disappears before taken by CPU. This can lead to GPR corruption. For example interrupt appears after the atomic load instruction starts executing and disappears before the atomic load instruction completes, in that case instruction (not all) can become "nop". As interrupt disappears before atomic instruction completes, cpu continues to execute and while take stale value from register as other dependent got "nop".

> What happens if a guest isn't aware of the erratum or actively tries to trigger it?

Errata applies to VM (EL1 virtual timer) as well. In addition extending the workaround to timer context save/restore in kvm seems to work.
Can you help if we are missing something in VM?

>
> > Time interrupt appears on timer
> > expiry and disappears when timer programming or timer disable. This
> > typically can happen when a load instruction misses in the cache,
> > which can take few hundreds of cycles, and an interrupt appears after
> > the load instruction starts executing but disappears before the load
> > instruction completes.
> >
> > Workaround of this is to ensure maximum 2us of time
>
> maximum? I'm not sure how you can bound this. Or did you mean *minimum*?

It is minimum

>
> How was this value obtained? What guarantees that it is safe?

H/w team suggested same

>
> > gap between timer interrupt and timer programming which can de-assert
> > timer interrupt.
>
> What guarantees do you have on the propagation of the interrupt signal from the
> timer to the CPU, given that the GIC is involved in the middle of it?

As per suggestion from h/w team it is sufficient delay to cover propagation delay.

>
> >
> > Signed-off-by: Linu Cherian <[email protected]>
> > Signed-off-by: Bharat Bhushan <[email protected]>
> > ---
> > arch/arm64/include/asm/cputype.h | 13 +++++++
> > arch/arm64/kernel/cpu_errata.c | 24 +++++++++++++
> > arch/arm64/tools/cpucaps | 1 +
> > drivers/clocksource/Kconfig | 13 +++++++
> > drivers/clocksource/arm_arch_timer.c | 54
> > ++++++++++++++++++++++++++++
> > 5 files changed, 105 insertions(+)
> >
> > diff --git a/arch/arm64/include/asm/cputype.h
> > b/arch/arm64/include/asm/cputype.h
> > index 6231e1f0abe7..e9a76935ee0f 100644
> > --- a/arch/arm64/include/asm/cputype.h
> > +++ b/arch/arm64/include/asm/cputype.h
> > @@ -81,6 +81,13 @@
> > #define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3
> > #define CAVIUM_CPU_PART_THUNDERX2 0x0AF
> >
> > +#define MRVL_CPU_PART_OCTEONTX2_98XX 0x0B1
> > +#define MRVL_CPU_PART_OCTEONTX2_96XX 0x0B2
> > +#define MRVL_CPU_PART_OCTEONTX2_95XX 0x0B3
> > +#define MRVL_CPU_PART_OCTEONTX2_LOKI 0x0B4
> > +#define MRVL_CPU_PART_OCTEONTX2_95MM 0x0B5
> > +#define MRVL_CPU_PART_OCTEONTX2_95O 0x0B6
> > +
> > #define BRCM_CPU_PART_BRAHMA_B53 0x100
> > #define BRCM_CPU_PART_VULCAN 0x516
> >
> > @@ -117,6 +124,12 @@
> > #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX
> > MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> CAVIUM_CPU_PART_THUNDERX_83XX)
> > #define MIDR_CAVIUM_THUNDERX2
> MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > CAVIUM_CPU_PART_THUNDERX2)
> > +#define MIDR_MRVL_OCTEONTX2_98XX
> MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > +MRVL_CPU_PART_OCTEONTX2_98XX) #define MIDR_MRVL_OCTEONTX2_96XX
> > +MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> MRVL_CPU_PART_OCTEONTX2_96XX)
> > +#define MIDR_MRVL_OCTEONTX2_95XX
> MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > +MRVL_CPU_PART_OCTEONTX2_95XX) #define MIDR_MRVL_OCTEONTX2_LOKI
> > +MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> MRVL_CPU_PART_OCTEONTX2_LOKI)
> > +#define MIDR_MRVL_OCTEONTX2_95MM
> MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> > +MRVL_CPU_PART_OCTEONTX2_95MM) #define
> MIDR_MRVL_OCTEONTX2_95O
> > +MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM,
> MRVL_CPU_PART_OCTEONTX2_95O)
> > #define MIDR_BRAHMA_B53 MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM,
> > BRCM_CPU_PART_BRAHMA_B53) #define MIDR_BRCM_VULCAN
> > MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN)
> #define
> > MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM,
> > QCOM_CPU_PART_FALKOR_V1)
>
> This part should be is a separate patch.

Ok,

>
> > diff --git a/arch/arm64/kernel/cpu_errata.c
> > b/arch/arm64/kernel/cpu_errata.c index e2c20c036442..363f83adb333
> > 100644
> > --- a/arch/arm64/kernel/cpu_errata.c
> > +++ b/arch/arm64/kernel/cpu_errata.c
> > @@ -339,6 +339,23 @@ static const struct midr_range erratum_1463225[] = {
> > {},
> > };
> > #endif
> > +#ifdef CONFIG_MARVELL_ERRATUM_38627
> > +static const struct midr_range marvell_erratum_38627_cpus[] = {
> > + /* Marvell OcteonTX 2, 95xx all passes */
> > + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95XX),
> > + /* Marvell OcteonTX 2, 95MM all passes */
> > + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_95MM),
> > + /* Marvell OcteonTX 2, LOKI all passes */
> > + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_LOKI),
> > + /* Marvell OcteonTX 2, 96xx all passes */
> > + MIDR_ALL_VERSIONS(MIDR_MRVL_OCTEONTX2_96XX),
> > + /* Marvell OcteonTX 2, 98xx pass 1.0 */
> > + MIDR_REV(MIDR_MRVL_OCTEONTX2_98XX, 0, 0),
> > + /* Marvell OcteonTX 2, 95O pass 1.0 */
> > + MIDR_REV(MIDR_MRVL_OCTEONTX2_95O, 0, 0),
>
> Is there any part that is *not* affected?

This is fixed in newer revision of octeaonTx2 part.

>
> > + {},
> > +};
> > +#endif
> >
> > const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef
> > CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
> > @@ -439,6 +456,13 @@ const struct arm64_cpu_capabilities arm64_errata[] =
> {
> > .capability = ARM64_WORKAROUND_858921,
> > ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
> > },
> > +#endif
> > +#ifdef CONFIG_MARVELL_ERRATUM_38627
> > + {
> > + .desc = "MARVELL erratum 38627",
> > + .capability = ARM64_WORKAROUND_MRVL_38627,
> > + ERRATA_MIDR_RANGE_LIST(marvell_erratum_38627_cpus),
> > + },
> > #endif
> > {
> > .desc = "Spectre-v2",
> > diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index
> > 49305c2e6dfd..99dbe80445ef 100644
> > --- a/arch/arm64/tools/cpucaps
> > +++ b/arch/arm64/tools/cpucaps
> > @@ -60,6 +60,7 @@ WORKAROUND_CAVIUM_TX2_219_PRFM
> > WORKAROUND_CAVIUM_TX2_219_TVM WORKAROUND_CLEAN_CACHE
> > WORKAROUND_DEVICE_LOAD_ACQUIRE
> > +WORKAROUND_MRVL_38627
> > WORKAROUND_NVIDIA_CARMEL_CNP
> > WORKAROUND_QCOM_FALKOR_E1003
> > WORKAROUND_REPEAT_TLBI
> > diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
> > index eb661b539a3e..83feb502cf02 100644
> > --- a/drivers/clocksource/Kconfig
> > +++ b/drivers/clocksource/Kconfig
> > @@ -309,6 +309,19 @@ config ARM_ARCH_TIMER_EVTSTREAM config
> > ARM_ARCH_TIMER_OOL_WORKAROUND
> > bool
> >
> > +config MARVELL_ERRATUM_38627
> > + bool "Workaround for Marvell Erratum 38627"
> > + default y
> > + depends on ARM_ARCH_TIMER && ARM64
> > + select ARM_ARCH_TIMER_OOL_WORKAROUND
> > + help
> > + This option enables a workaround for Marvell Erratum
> > + 38627. According to this errata CPU pipeline have
> > + unpredicted behavior when timer interrupt appears and
> > + then disappears prior to the exception happening.
>
> This wording is actually a lot clearer than the commit message. What happens if a
> guest plays with the timer?
>
> > + This Errata workaround is applicable only to some Marvell
> > + OcteonTX2 series of processors.
> > +
> > config FSL_ERRATUM_A008585
> > bool "Workaround for Freescale/NXP Erratum A-008585"
> > default y
> > diff --git a/drivers/clocksource/arm_arch_timer.c
> > b/drivers/clocksource/arm_arch_timer.c
> > index be6d741d404c..d1c5e2aa2e7c 100644
> > --- a/drivers/clocksource/arm_arch_timer.c
> > +++ b/drivers/clocksource/arm_arch_timer.c
> > @@ -27,6 +27,7 @@
> > #include <linux/acpi.h>
> > #include <linux/arm-smccc.h>
> > #include <linux/ptp_kvm.h>
> > +#include <linux/delay.h>
> >
> > #include <asm/arch_timer.h>
> > #include <asm/virt.h>
> > @@ -88,6 +89,8 @@ static enum vdso_clock_mode vdso_default =
> > VDSO_CLOCKMODE_NONE;
> >
> > static cpumask_t evtstrm_available = CPU_MASK_NONE; static bool
> > evtstrm_enable __ro_after_init =
> > IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM);
> > +static __always_inline void set_next_event(const int access, unsigned long evt,
> > + struct clock_event_device *clk);
> >
> > static int __init early_evtstrm_cfg(char *buf) { @@ -432,6 +435,48
> > @@ static __maybe_unused int erratum_set_next_event_tval_phys(unsigned
> long evt,
> > return 0;
> > }
> >
> > +#ifdef CONFIG_MARVELL_ERRATUM_38627
> > +/* Workaround is to ensure maximum 2us of time gap between timer
> > +expiry
> > + * and timer programming which can de-assert timer interrupt.
> > + * Time calculation below is based on 100MHz as timer frequency is
> > +fixed
> > + * to 100MHz on all affected parts.
> > + */
>
> Please fix the comment style.
>
> > +static __always_inline
>
> Please drop this __always_inline. There is no hard requirement for it, and you
> are about to *wait*, so waiting faster is... not that interesting. The compiler will
> figure it out.

Ok,

>
> > +void erratum_38627_set_next_event(const int access, unsigned long evt,
> > + struct clock_event_device *clk) {
> > + int32_t tval;
> > +
> > + tval = arch_timer_reg_read(access, ARCH_TIMER_REG_TVAL, clk);
> > +
> > + /* Timer already expired, wait for (2 - expired time)us */
> > + if ((tval > -200) && (tval < 0))
> > + udelay(2 + tval/100);
> > +
> > + /* Timer is about to expire, wait for 2us + time to expire */
> > + if (tval >= 0 && tval < 200)
> > + udelay(3 + tval/100);
>
> I read 3us here, which contradicts the comment. Also, even if you wait for an
> arbitrary amount of time, I still don't see what guarantees you may have that the
> interrupt will have actually be signalled.

Yes, minimum 2us,

If "0 < tval < 100", in that case we want to wait 2us after tval expiry, so that turn out to be 2us + 1us (round up of 0 < tval < 100) = 3us.
For 99 < tval < 200, in that case we want to wait 2us after tval expiry, In that case that turn out to be 2us + 2us (round up of 99 < tval < 200) = 4us.

>
> > +
> > + set_next_event(access, evt, clk);
> > +}
> > +
> > +static __maybe_unused
> > +int erratum_38627_set_next_event_tval_virt(unsigned long evt,
> > + struct clock_event_device *clk) {
> > + erratum_38627_set_next_event(ARCH_TIMER_VIRT_ACCESS, evt, clk);
> > + return 0;
> > +}
> > +
> > +static __maybe_unused
> > +int erratum_38627_set_next_event_tval_phys(unsigned long evt,
> > + struct clock_event_device *clk) {
> > + erratum_38627_set_next_event(ARCH_TIMER_PHYS_ACCESS, evt, clk);
> > + return 0;
> > +}
> > +#endif
> > +
> > static const struct arch_timer_erratum_workaround ool_workarounds[] =
> > { #ifdef CONFIG_FSL_ERRATUM_A008585
> > {
> > @@ -479,6 +524,15 @@ static const struct arch_timer_erratum_workaround
> ool_workarounds[] = {
> > .read_cntvct_el0 = arm64_858921_read_cntvct_el0,
> > },
> > #endif
> > +#ifdef CONFIG_MARVELL_ERRATUM_38627
> > + {
> > + .match_type = ate_match_local_cap_id,
> > + .id = (void *)ARM64_WORKAROUND_MRVL_38627,
> > + .desc = "Marvell erratum 38627",
> > + .set_next_event_phys =
> erratum_38627_set_next_event_tval_phys,
> > + .set_next_event_virt = erratum_38627_set_next_event_tval_virt,
> > + },
> > +#endif
> > #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1
> > {
> > .match_type = ate_match_dt,
>
> You need to update Documentation/arm64/silicon-errata.rst.

Will do,

>
> Overall, this patch raises *a lot* of questions...

Looking forward to understand all possible cases we need to extend workaround.

>
> M.
>
> --
> Without deviation from the norm, progress is not possible.

2021-07-08 11:43:28

by Mark Rutland

[permalink] [raw]
Subject: Re: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

On Thu, Jul 08, 2021 at 10:47:42AM +0000, Bharat Bhushan wrote:
> Hi Mark,
>
> Sorry for the delay, was gathering some details.
> Pease see inline
>
> > -----Original Message-----
> > From: Mark Rutland <[email protected]>
> > Sent: Monday, July 5, 2021 2:38 PM
> > To: Bharat Bhushan <[email protected]>
> > Cc: [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; linux-arm-
> > [email protected]; [email protected]; Linu Cherian
> > <[email protected]>
> > Subject: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > Hi Bharat,
> >
> > On Mon, Jul 05, 2021 at 11:38:43AM +0530, Bharat Bhushan wrote:
> > > CPU pipeline have unpredicted behavior when timer interrupt appears
> > > and then disappears prior to the exception happening. Time interrupt
> > > appears on timer expiry and disappears when timer programming or timer
> > > disable. This typically can happen when a load instruction misses in
> > > the cache, which can take few hundreds of cycles, and an interrupt
> > > appears after the load instruction starts executing but disappears
> > > before the load instruction completes.
> >
> > Could you elaborate on the scenario? What sort of unpredictable behaviour can
> > occur? e.g:
>
> This is a race condition where an instruction (except store, system,
> load atomic and load exclusive) becomes "nop" if interrupt appears and
> disappears before taken by CPU. For example interrupt appears after
> the atomic load instruction starts executing and disappears before the
> atomic load instruction completes, in that case instruction (not all)
> can become "nop". As interrupt disappears before atomic instruction
> completes, cpu continues to execute and while take junk from register
> as other dependent got "nop".

Thanks for this; I have a number of further questions below.

You said this doesn't apply to:

* store
* system
* load atomic
* load exclusive

... but your example explains this happening for an atomic load, which
was in that list. Was the example bad, or was the list wrong?

It's not entirely clear to me which instructions this covers. e.g. is
"system" the entire system instruction class (i.e. all opcodes
0b110101010_0xxxxxxx_xxxxxxxx_xxxxxxxx), or did you mean something more
specific? Does "store" include store-exlcusive?

Other than that list, can this occur for *any* instruction? e.g. MOV,
SHA256*, *DIV?

Does this only apply to a single instruction at a time, or can multiple
instructions "become nop"?

When an instruction "becomes nop", will subsequent instructions see a
consistent architectural state (e.g. GPRs as they were exactly before
the instruction which "becomes nop"), or can they see something else
(e.g. garbage forwarded from register renaming or other internal
microarchitectural state)?

> > * Does the CPU lockup?
> No
>
> > * Does the CPU take the exception at all?
> No
>
> > * Does the load behave erroneously?
> No,
>
> > * Does any CPU state (e.g. GPRs, PC, PSTATE) become corrupted?
>
> yes, GPRs will get corrupted, will have stale value

As above, is that the prior architectural value of the GPRs, or can that
be some bogus microarchitectural state (e.g. from renaming or other
forwarding paths)?

> > Does the problem manifest when IRQs are masked by DAIF.I, or by
> > CNT*_CTL_EL0.{IMASK,ENABLE} ?
>
> No, there are no issue if interrupts are masked.

If a write to CNTV_CTL_EL0.IMASK races with the interrupt being
asserted, can that trigger the problem?

If a write to DAIF.I races with the interrupt being asserted, can that
trigger the problem?

From your description so far, this doesn't sound like it is specific to
the timer interrupt. Is it possible for a different interrupt to trigger
this, e.g:

* Can the same happen with another PPI, e.g. the PMU interrupt if that
gets de-asserted, or there's a race with DAIF.I?

* Can the same happen with an SGI, e.g. if one CPU asserts then
de-asserts an SGI targetting another CPU, or there's a race with
DAIF.I?

* Can the same happen with an SPI, e.g. if a device asserts then
de-asserts its IRQ line, or there's a race with DAIF.I?

If not, *why* does this happen specifically for the timer interrupt?

> > > Workaround of this is to ensure maximum 2us of time gap between timer
> > > interrupt and timer programming which can de-assert timer interrupt.
> >
> > The code below seems to try to enforce a 2us *minimum*. Which is it supposed
> > to be?
>
> Yes, it is minimum 2us.
>
> >
> > Can you explain *why* this is supposed to help?
> With the workaround interrupt assertion and de-assertion will be minimum 2us apart.

I understood that, but why is that deemed to be sufficient? e.g. is it
somehow guaranteed that the CPU will complete the instruction that would
"become nop" in that time?

> > I don't see how we can guarantee this in a VM, or if the CPU misses on an
> > instruction fetch.
>
> This errata applies to VM (virtual timer) as well, maybe there is some
> gap in my understanding, how it will be different in VM.
> Can you help with what issue we can have VM?

A VCPU can be pre-empted by the host at *any* time, for an arbitrary
length of time. So e.g. you can have a scenario such as:

1. Guest reads CNTx_TVAL, sees interrupt is 4us in the future and
decides it does not need to wait
2. Host preempts guest
3. Host does some processing for ~3.9us
4. Host returns to guest, with 0.1us left until the interrupt triggers
5. Guest reprograms CNTx_TVAL, and triggers the erratum

Thanks,
Mark.

2021-07-11 09:58:31

by Marc Zyngier

[permalink] [raw]
Subject: Re: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

On Thu, 08 Jul 2021 11:48:18 +0100,
Bharat Bhushan <[email protected]> wrote:
>
> Hi Marc,
>
> Similar questions are asked by Mark, response might be duplicated.

Mark had a ton of very good questions, so I won't repeat them. Some
more below though:

> > -----Original Message-----
> > From: Marc Zyngier <[email protected]>
> > Sent: Monday, July 5, 2021 2:57 PM
> > To: Bharat Bhushan <[email protected]>
> > Cc: [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; linux-arm-
> > [email protected]; [email protected]; Linu Cherian
> > <[email protected]>
> > Subject: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround
> >
> > External Email
> >
> > ----------------------------------------------------------------------
> > On Mon, 05 Jul 2021 07:08:43 +0100,
> > Bharat Bhushan <[email protected]> wrote:
> > >
> > > CPU pipeline have unpredicted behavior when timer interrupt appears
> > > and then disappears prior to the exception happening.
> >
> > What kind of unpredictable behaviours?
>
> This is a race condition where an instruction (except store, system,
> load atomic and load exclusive) becomes "nop" if interrupt appears
> and disappears before taken by CPU. This can lead to GPR
> corruption. For example interrupt appears after the atomic load
> instruction starts executing and disappears before the atomic load
> instruction completes, in that case instruction (not all) can become
> "nop". As interrupt disappears before atomic instruction completes,
> cpu continues to execute and while take stale value from register as
> other dependent got "nop".

So here's what I understand from the above:

- Interrupts being a context synchronisation event, the CPU deals with
them by preventing in-flight instructions from having any effect
(what you above describe as becoming NOP).

- If the interrupt is recalled before the exception entry can take
place, the exception doesn't occur, but the discarded instructions
are not replayed, leaving the program in an inconsistent state.

Is this interpretation correct? If so, I have more questions:

- Does the erratum trigger when interrupts are masked in PSTATE? Can
this erratum be triggered by masking interrupts in PSTATE?

- What makes this specific to the timer? Why can't this be triggered
with any other interrupt? Spurious interrupts do exist, and happen
all the time, specially with level triggered interrupts.

- What if *another* CPU masks the interrupt at the GIC redistributor
level?

> > What happens if a guest isn't aware of the erratum or actively
> > tries to trigger it?
>
> Errata applies to VM (EL1 virtual timer) as well. In addition
> extending the workaround to timer context save/restore in kvm seems
> to work. Can you help if we are missing something in VM?

Maybe. First, I want to understand why this is specific to the timer,
and whether this can have any impact when already in an exception
context. I'm not convinced that this issue is specific to the timer
either.

Which revision of the architecture does this CPU implements? Depending
on whether the CPU runs VHE or not, we handle things slightly differently.

> > > Time interrupt appears on timer
> > > expiry and disappears when timer programming or timer disable. This
> > > typically can happen when a load instruction misses in the cache,
> > > which can take few hundreds of cycles, and an interrupt appears after
> > > the load instruction starts executing but disappears before the load
> > > instruction completes.
> > >
> > > Workaround of this is to ensure maximum 2us of time
> >
> > maximum? I'm not sure how you can bound this. Or did you mean *minimum*?
>
> It is minimum
>
> >
> > How was this value obtained? What guarantees that it is safe?
>
> H/w team suggested same

This doesn't answer my question.

Thanks,

M.

--
Without deviation from the norm, progress is not possible.

2021-07-13 02:42:27

by Bharat Bhushan

[permalink] [raw]
Subject: RE: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

Hi Mark,

> -----Original Message-----
> From: Mark Rutland <[email protected]>
> Sent: Thursday, July 8, 2021 5:12 PM
> To: Bharat Bhushan <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; linux-arm-
> [email protected]; [email protected]; Linu Cherian
> <[email protected]>
> Subject: Re: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627
> workaround
>
> On Thu, Jul 08, 2021 at 10:47:42AM +0000, Bharat Bhushan wrote:
> > Hi Mark,
> >
> > Sorry for the delay, was gathering some details.
> > Pease see inline
> >
> > > -----Original Message-----
> > > From: Mark Rutland <[email protected]>
> > > Sent: Monday, July 5, 2021 2:38 PM
> > > To: Bharat Bhushan <[email protected]>
> > > Cc: [email protected]; [email protected];
> > > [email protected]; [email protected];
> > > [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]; linux-arm- [email protected];
> > > [email protected]; Linu Cherian <[email protected]>
> > > Subject: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627
> > > workaround
> > >
> > > External Email
> > >
> > > --------------------------------------------------------------------
> > > --
> > > Hi Bharat,
> > >
> > > On Mon, Jul 05, 2021 at 11:38:43AM +0530, Bharat Bhushan wrote:
> > > > CPU pipeline have unpredicted behavior when timer interrupt
> > > > appears and then disappears prior to the exception happening. Time
> > > > interrupt appears on timer expiry and disappears when timer
> > > > programming or timer disable. This typically can happen when a
> > > > load instruction misses in the cache, which can take few hundreds
> > > > of cycles, and an interrupt appears after the load instruction
> > > > starts executing but disappears before the load instruction completes.
> > >
> > > Could you elaborate on the scenario? What sort of unpredictable
> > > behaviour can occur? e.g:
> >
> > This is a race condition where an instruction (except store, system,
> > load atomic and load exclusive) becomes "nop" if interrupt appears and
> > disappears before taken by CPU. For example interrupt appears after
> > the atomic load instruction starts executing and disappears before the
> > atomic load instruction completes, in that case instruction (not all)
> > can become "nop". As interrupt disappears before atomic instruction
> > completes, cpu continues to execute and while take junk from register
> > as other dependent got "nop".
>
> Thanks for this; I have a number of further questions below.
>
> You said this doesn't apply to:
>
> * store
> * system
> * load atomic
> * load exclusive
>
> ... but your example explains this happening for an atomic load, which was in
> that list. Was the example bad, or was the list wrong?

The load atomic completes successfully. It doesn't become a nop. A loads atomic is significant just because it's an instruction which has a long time between executing an retiring. This provides a window of vulnerability when an interrupt asserts and then deasserts. This stimulates the bug and causes other instructions executing in parallel, which can get nop.

>
> It's not entirely clear to me which instructions this covers. e.g. is "system" the
> entire system instruction class (i.e. all opcodes
> 0b110101010_0xxxxxxx_xxxxxxxx_xxxxxxxx), or did you mean something more
> specific? Does "store" include store-exlcusive?
>
> Other than that list, can this occur for *any* instruction? e.g. MOV, SHA256*,
> *DIV?

There are two general classes of instructions. Those that only change a gpr or PC. These are arithmetic, floating point, branch. Loads with no side effects also fall into this category. These are the instructions that can erroneously be nop'd. The other category are instructions that can change architectural state more than a GPR. These include all stores, atomic loads, exclusive loads, loads to non-cacheable space,msr,mrs,eret,tlb*,sys,brk,etc, these does not get "noped"

>
> Does this only apply to a single instruction at a time, or can multiple instructions
> "become nop"?

Can be multiple,

>
> When an instruction "becomes nop", will subsequent instructions see a
> consistent architectural state (e.g. GPRs as they were exactly before the
> instruction which "becomes nop"), or can they see something else (e.g. garbage
> forwarded from register renaming or other internal microarchitectural state)?



>
> > > * Does the CPU lockup?
> > No
> >
> > > * Does the CPU take the exception at all?
> > No
> >
> > > * Does the load behave erroneously?
> > No,
> >
> > > * Does any CPU state (e.g. GPRs, PC, PSTATE) become corrupted?
> >
> > yes, GPRs will get corrupted, will have stale value
>
> As above, is that the prior architectural value of the GPRs, or can that be some
> bogus microarchitectural state (e.g. from renaming or other forwarding paths)?

The instructions that become a nop doesn't write the GPR and because this is an OOO machine the GPR result isn't the prior architectural value but whatever crap is leftover in the physical register.

>
> > > Does the problem manifest when IRQs are masked by DAIF.I, or by
> > > CNT*_CTL_EL0.{IMASK,ENABLE} ?
> >
> > No, there are no issue if interrupts are masked.
>
> If a write to CNTV_CTL_EL0.IMASK races with the interrupt being asserted, can
> that trigger the problem?

If interrupt is enabled (DAIF) - then it will be taken, and no issue
But if interrupts are disabled then following sequence can see the race
1) interrupt is disabled (DAIF)
2) TVAL/ENABLE/IMASK at timer h/w programming to de-assert interrupt.
Race of Irq asserted before irq de-asserted, than this short window of assertion will be considered as spike from timer h/w block
3) Enable DAIF
Because of propagation delay CPU sees assertion and de-assertion (spike), errata hit

Will add "isb" around interrupt enablement in next version of patch.

>
> If a write to DAIF.I races with the interrupt being asserted, can that trigger the
> problem?

No race with writing to DAIF.I with interrupt assertion,
Writing DAIF.I = 0 (enablement of interrupt) can race with de-assertion, which can lead to hitting errata


>
> From your description so far, this doesn't sound like it is specific to the timer
> interrupt. Is it possible for a different interrupt to trigger this, e.g:
>
> * Can the same happen with another PPI, e.g. the PMU interrupt if that
> gets de-asserted, or there's a race with DAIF.I?
>
> * Can the same happen with an SGI, e.g. if one CPU asserts then
> de-asserts an SGI targetting another CPU, or there's a race with
> DAIF.I?
>
> * Can the same happen with an SPI, e.g. if a device asserts then
> de-asserts its IRQ line, or there's a race with DAIF.I?

No issue with edge triggered, but this can happen with any level sensitive interrupt.

>
> If not, *why* does this happen specifically for the timer interrupt?
>
> > > > Workaround of this is to ensure maximum 2us of time gap between
> > > > timer interrupt and timer programming which can de-assert timer interrupt.
> > >
> > > The code below seems to try to enforce a 2us *minimum*. Which is it
> > > supposed to be?
> >
> > Yes, it is minimum 2us.
> >
> > >
> > > Can you explain *why* this is supposed to help?
> > With the workaround interrupt assertion and de-assertion will be minimum 2us
> apart.
>
> I understood that, but why is that deemed to be sufficient? e.g. is it somehow
> guaranteed that the CPU will complete the instruction that would "become nop"
> in that time?

With this delay we avoid spike, either this this will becomes an actual interrupt or the spike never visible to core.

>
> > > I don't see how we can guarantee this in a VM, or if the CPU misses
> > > on an instruction fetch.
> >
> > This errata applies to VM (virtual timer) as well, maybe there is some
> > gap in my understanding, how it will be different in VM.
> > Can you help with what issue we can have VM?
>
> A VCPU can be pre-empted by the host at *any* time, for an arbitrary length of
> time. So e.g. you can have a scenario such as:
>
> 1. Guest reads CNTx_TVAL, sees interrupt is 4us in the future and
> decides it does not need to wait
> 2. Host preempts guest
> 3. Host does some processing for ~3.9us
> 4. Host returns to guest, with 0.1us left until the interrupt triggers 5. Guest
> reprograms CNTx_TVAL, and triggers the erratum

Yes, when timer expire just before tval written (race condition) , so there is assertion-followed by de-assertion, As interrupts are enabled in host, interrupt will be visible as spike to host.
We will apply workaround whenever entering to guest (add a delay before exiting to guest in case guest timer is going to expire).

Thanks
-Bharat

>
> Thanks,
> Mark.

2021-07-13 15:41:34

by Marc Zyngier

[permalink] [raw]
Subject: Re: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

Hi Bharat,

On Tue, 13 Jul 2021 03:40:22 +0100,
Bharat Bhushan <[email protected]> wrote:
>
> Hi Mark,
>
> -----Original Message-----
> From: Mark Rutland <[email protected]>

[...]

> > From your description so far, this doesn't sound like it is
> > specific to the timer interrupt. Is it possible for a different
> > interrupt to trigger this, e.g:
> >
> > * Can the same happen with another PPI, e.g. the PMU interrupt if that
> > gets de-asserted, or there's a race with DAIF.I?
> >
> > * Can the same happen with an SGI, e.g. if one CPU asserts then
> > de-asserts an SGI targetting another CPU, or there's a race with
> > DAIF.I?
> >
> > * Can the same happen with an SPI, e.g. if a device asserts then
> > de-asserts its IRQ line, or there's a race with DAIF.I?
>
> No issue with edge triggered, but this can happen with any level
> sensitive interrupt.

So let's say CPU0 is targeted by a level-triggered SPI, and right when
the interrupt is reaching the CPU interface, CPU1 disables this
interrupt, which gets recalled, and CPU0 never takes the interrupt.
Bug hits again. Drivers do that.

I actually suspect that an edge-triggered interrupt would result in
the same issue, unless your GIC implementation isn't able to perform a
recall on edge interrupts.

I don't understand why you are only considering the timer here. Any
interrupt can trigger this, and if there is going to be a workaround,
it will need to be robust against all interrupts being retired, no
matter what device triggers it.

And given that the OoO nature of the machine leaks non architectural
state, potentially belonging to a different security context, this
isn't something that should be taken lightly.

M.

--
Without deviation from the norm, progress is not possible.

2021-07-13 16:13:44

by Mark Rutland

[permalink] [raw]
Subject: Re: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

On Tue, Jul 13, 2021 at 02:40:22AM +0000, Bharat Bhushan wrote:
> Hi Mark,
>
> > -----Original Message-----
> > From: Mark Rutland <[email protected]>
> > Sent: Thursday, July 8, 2021 5:12 PM
> > To: Bharat Bhushan <[email protected]>
> > Cc: [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected];
> > [email protected]; [email protected]; [email protected];
> > [email protected]; [email protected]; linux-arm-
> > [email protected]; [email protected]; Linu Cherian
> > <[email protected]>
> > Subject: Re: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627
> > workaround
> >
> > On Thu, Jul 08, 2021 at 10:47:42AM +0000, Bharat Bhushan wrote:
> > > Hi Mark,
> > >
> > > Sorry for the delay, was gathering some details.
> > > Pease see inline
> > >
> > > > -----Original Message-----
> > > > From: Mark Rutland <[email protected]>
> > > > Sent: Monday, July 5, 2021 2:38 PM
> > > > To: Bharat Bhushan <[email protected]>
> > > > Cc: [email protected]; [email protected];
> > > > [email protected]; [email protected];
> > > > [email protected]; [email protected];
> > > > [email protected]; [email protected]; [email protected];
> > > > [email protected]; linux-arm- [email protected];
> > > > [email protected]; Linu Cherian <[email protected]>
> > > > Subject: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627
> > > > workaround
> > > >
> > > > External Email
> > > >
> > > > --------------------------------------------------------------------
> > > > --
> > > > Hi Bharat,
> > > >
> > > > On Mon, Jul 05, 2021 at 11:38:43AM +0530, Bharat Bhushan wrote:
> > > > > CPU pipeline have unpredicted behavior when timer interrupt
> > > > > appears and then disappears prior to the exception happening. Time
> > > > > interrupt appears on timer expiry and disappears when timer
> > > > > programming or timer disable. This typically can happen when a
> > > > > load instruction misses in the cache, which can take few hundreds
> > > > > of cycles, and an interrupt appears after the load instruction
> > > > > starts executing but disappears before the load instruction completes.
> > > >
> > > > Could you elaborate on the scenario? What sort of unpredictable
> > > > behaviour can occur? e.g:
> > >
> > > This is a race condition where an instruction (except store, system,
> > > load atomic and load exclusive) becomes "nop" if interrupt appears and
> > > disappears before taken by CPU. For example interrupt appears after
> > > the atomic load instruction starts executing and disappears before the
> > > atomic load instruction completes, in that case instruction (not all)
> > > can become "nop". As interrupt disappears before atomic instruction
> > > completes, cpu continues to execute and while take junk from register
> > > as other dependent got "nop".
> >
> > Thanks for this; I have a number of further questions below.
> >
> > You said this doesn't apply to:
> >
> > * store
> > * system
> > * load atomic
> > * load exclusive
> >
> > ... but your example explains this happening for an atomic load, which was in
> > that list. Was the example bad, or was the list wrong?
>
> The load atomic completes successfully. It doesn't become a nop. A
> loads atomic is significant just because it's an instruction which has
> a long time between executing an retiring. This provides a window of
> vulnerability when an interrupt asserts and then deasserts. This
> stimulates the bug and causes other instructions executing in
> parallel, which can get nop.

Thanks for clarifiying; this was not clear from your initial
description.

> > It's not entirely clear to me which instructions this covers. e.g. is "system" the
> > entire system instruction class (i.e. all opcodes
> > 0b110101010_0xxxxxxx_xxxxxxxx_xxxxxxxx), or did you mean something more
> > specific? Does "store" include store-exlcusive?
> >
> > Other than that list, can this occur for *any* instruction? e.g. MOV, SHA256*,
> > *DIV?
>
> There are two general classes of instructions. Those that only change
> a gpr or PC. These are arithmetic, floating point, branch. Loads with
> no side effects also fall into this category. These are the
> instructions that can erroneously be nop'd. The other category are
> instructions that can change architectural state more than a GPR.
> These include all stores, atomic loads, exclusive loads, loads to
> non-cacheable space,msr,mrs,eret,tlb*,sys,brk,etc, these does not get
> "noped"
>
> >
> > Does this only apply to a single instruction at a time, or can multiple instructions
> > "become nop"?
>
> Can be multiple,
>
> >
> > When an instruction "becomes nop", will subsequent instructions see a
> > consistent architectural state (e.g. GPRs as they were exactly before the
> > instruction which "becomes nop"), or can they see something else (e.g. garbage
> > forwarded from register renaming or other internal microarchitectural state)?
>
> > > > * Does the CPU lockup?
> > > No
> > >
> > > > * Does the CPU take the exception at all?
> > > No
> > >
> > > > * Does the load behave erroneously?
> > > No,
> > >
> > > > * Does any CPU state (e.g. GPRs, PC, PSTATE) become corrupted?
> > >
> > > yes, GPRs will get corrupted, will have stale value
> >
> > As above, is that the prior architectural value of the GPRs, or can that be some
> > bogus microarchitectural state (e.g. from renaming or other forwarding paths)?
>
> The instructions that become a nop doesn't write the GPR and because
> this is an OOO machine the GPR result isn't the prior architectural
> value but whatever crap is leftover in the physical register.

Ok, so that's a potential information leak from a different context
(e.g. higher EL), depending on what happened to be left in that physical
register.

Consider a malicious guest at EL1. What prevents it from triggering this
deliberately, then inspecting the GPRs after taking the IRQ in order to
read prior secrets?

> > > > Does the problem manifest when IRQs are masked by DAIF.I, or by
> > > > CNT*_CTL_EL0.{IMASK,ENABLE} ?
> > >
> > > No, there are no issue if interrupts are masked.
> >
> > If a write to CNTV_CTL_EL0.IMASK races with the interrupt being asserted, can
> > that trigger the problem?
>
> If interrupt is enabled (DAIF) - then it will be taken, and no issue
> But if interrupts are disabled then following sequence can see the race
> 1) interrupt is disabled (DAIF)
> 2) TVAL/ENABLE/IMASK at timer h/w programming to de-assert interrupt.
> Race of Irq asserted before irq de-asserted, than this short window of assertion will be considered as spike from timer h/w block
> 3) Enable DAIF
> Because of propagation delay CPU sees assertion and de-assertion (spike), errata hit
>
> Will add "isb" around interrupt enablement in next version of patch.

... why?

That doesn't seem to follow from the abvoe, so I think I'm missing a
step.

> > If a write to DAIF.I races with the interrupt being asserted, can that trigger the
> > problem?
>
> No race with writing to DAIF.I with interrupt assertion,
> Writing DAIF.I = 0 (enablement of interrupt) can race with de-assertion, which can lead to hitting errata

Ok.

That *might* make it possible to bodge around the timer specifically,
but as below I don't think we can ensure this is safe in the presence of
virtualization, nor when considering other interrupts.

> > From your description so far, this doesn't sound like it is specific to the timer
> > interrupt. Is it possible for a different interrupt to trigger this, e.g:
> >
> > * Can the same happen with another PPI, e.g. the PMU interrupt if that
> > gets de-asserted, or there's a race with DAIF.I?
> >
> > * Can the same happen with an SGI, e.g. if one CPU asserts then
> > de-asserts an SGI targetting another CPU, or there's a race with
> > DAIF.I?
> >
> > * Can the same happen with an SPI, e.g. if a device asserts then
> > de-asserts its IRQ line, or there's a race with DAIF.I?
>
> No issue with edge triggered, but this can happen with any level sensitive interrupt.

Ok.

So that'll include at least the PMU and

> > If not, *why* does this happen specifically for the timer interrupt?
> >
> > > > > Workaround of this is to ensure maximum 2us of time gap between
> > > > > timer interrupt and timer programming which can de-assert timer interrupt.
> > > >
> > > > The code below seems to try to enforce a 2us *minimum*. Which is it
> > > > supposed to be?
> > >
> > > Yes, it is minimum 2us.
> > >
> > > >
> > > > Can you explain *why* this is supposed to help?
> > > With the workaround interrupt assertion and de-assertion will be minimum 2us
> > apart.
> >
> > I understood that, but why is that deemed to be sufficient? e.g. is it somehow
> > guaranteed that the CPU will complete the instruction that would "become nop"
> > in that time?
>
> With this delay we avoid spike, either this this will becomes an
> actual interrupt or the spike never visible to core.
>
> > > > I don't see how we can guarantee this in a VM, or if the CPU misses
> > > > on an instruction fetch.
> > >
> > > This errata applies to VM (virtual timer) as well, maybe there is some
> > > gap in my understanding, how it will be different in VM.
> > > Can you help with what issue we can have VM?
> >
> > A VCPU can be pre-empted by the host at *any* time, for an arbitrary length of
> > time. So e.g. you can have a scenario such as:
> >
> > 1. Guest reads CNTx_TVAL, sees interrupt is 4us in the future and
> > decides it does not need to wait
> > 2. Host preempts guest
> > 3. Host does some processing for ~3.9us
> > 4. Host returns to guest, with 0.1us left until the interrupt triggers 5. Guest
> > reprograms CNTx_TVAL, and triggers the erratum
>
> Yes, when timer expire just before tval written (race condition) , so
> there is assertion-followed by de-assertion, As interrupts are enabled
> in host, interrupt will be visible as spike to host.

Ok, so that's a recipe for the guest to attack the host.

> We will apply workaround whenever entering to guest (add a delay
> before exiting to guest in case guest timer is going to expire).

I think this is papering over the problem.

You've said this can happen for *any* level-triggered interrupt. AFAIK,
nothing prevents a malicious guest from deliberately asserting and
de-asserting a level-triggered interrupt (e.g. by writing to the GIC
distributor), and I also note that the GIC maintenance interrupt is
level-triggered.

So, as above:

1) A guest can deliberately cause information to be leaked to itself via
the corrupted GPRs. I haven't seen any rationale for why that is not
a problem, nor have I seen a suggested workaround.

2) A guest *may* be able to trigger this while the host is running. I
haven't seen anything that rules this out so far.

3) Even in the absence of virtualization, it would be necessary to
workaround this for *every* level-triggered interrupt, which includes
at the timer, PMU, and GIC maintenance interrupts, in addition to any
other configurable PPIs or SPIs.

Without a fix that covers all of those, I don't think the workaround is
viable.

Thanks,
Mark.

2021-07-26 04:31:32

by Bharat Bhushan

[permalink] [raw]
Subject: RE: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

Sorry for delayed response

Please see inline

> -----Original Message-----
> From: Mark Rutland <[email protected]>
> Sent: Tuesday, July 13, 2021 9:43 PM
> To: Bharat Bhushan <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; linux-arm-
> [email protected]; [email protected]; Linu Cherian
> <[email protected]>; Sunil Kovvuri Goutham <[email protected]>
> Subject: Re: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627
> workaround
>
> On Tue, Jul 13, 2021 at 02:40:22AM +0000, Bharat Bhushan wrote:
> > Hi Mark,
> >
> > > -----Original Message-----
> > > From: Mark Rutland <[email protected]>
> > > Sent: Thursday, July 8, 2021 5:12 PM
> > > To: Bharat Bhushan <[email protected]>
> > > Cc: [email protected]; [email protected];
> > > [email protected]; [email protected];
> > > [email protected]; [email protected];
> > > [email protected]; [email protected]; [email protected];
> > > [email protected]; linux-arm- [email protected];
> > > [email protected]; Linu Cherian <[email protected]>
> > > Subject: Re: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627
> > > workaround
> > >
> > > On Thu, Jul 08, 2021 at 10:47:42AM +0000, Bharat Bhushan wrote:
> > > > Hi Mark,
> > > >
> > > > Sorry for the delay, was gathering some details.
> > > > Pease see inline
> > > >
> > > > > -----Original Message-----
> > > > > From: Mark Rutland <[email protected]>
> > > > > Sent: Monday, July 5, 2021 2:38 PM
> > > > > To: Bharat Bhushan <[email protected]>
> > > > > Cc: [email protected]; [email protected];
> > > > > [email protected]; [email protected];
> > > > > [email protected]; [email protected];
> > > > > [email protected]; [email protected]; [email protected];
> > > > > [email protected]; linux-arm- [email protected];
> > > > > [email protected]; Linu Cherian
> > > > > <[email protected]>
> > > > > Subject: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627
> > > > > workaround
> > > > >
> > > > > External Email
> > > > >
> > > > > ----------------------------------------------------------------
> > > > > ----
> > > > > --
> > > > > Hi Bharat,
> > > > >
> > > > > On Mon, Jul 05, 2021 at 11:38:43AM +0530, Bharat Bhushan wrote:
> > > > > > CPU pipeline have unpredicted behavior when timer interrupt
> > > > > > appears and then disappears prior to the exception happening.
> > > > > > Time interrupt appears on timer expiry and disappears when
> > > > > > timer programming or timer disable. This typically can happen
> > > > > > when a load instruction misses in the cache, which can take
> > > > > > few hundreds of cycles, and an interrupt appears after the
> > > > > > load instruction starts executing but disappears before the load
> instruction completes.
> > > > >
> > > > > Could you elaborate on the scenario? What sort of unpredictable
> > > > > behaviour can occur? e.g:
> > > >
> > > > This is a race condition where an instruction (except store,
> > > > system, load atomic and load exclusive) becomes "nop" if interrupt
> > > > appears and disappears before taken by CPU. For example interrupt
> > > > appears after the atomic load instruction starts executing and
> > > > disappears before the atomic load instruction completes, in that
> > > > case instruction (not all) can become "nop". As interrupt
> > > > disappears before atomic instruction completes, cpu continues to
> > > > execute and while take junk from register as other dependent got "nop".
> > >
> > > Thanks for this; I have a number of further questions below.
> > >
> > > You said this doesn't apply to:
> > >
> > > * store
> > > * system
> > > * load atomic
> > > * load exclusive
> > >
> > > ... but your example explains this happening for an atomic load,
> > > which was in that list. Was the example bad, or was the list wrong?
> >
> > The load atomic completes successfully. It doesn't become a nop. A
> > loads atomic is significant just because it's an instruction which has
> > a long time between executing an retiring. This provides a window of
> > vulnerability when an interrupt asserts and then deasserts. This
> > stimulates the bug and causes other instructions executing in
> > parallel, which can get nop.
>
> Thanks for clarifiying; this was not clear from your initial description.
>
> > > It's not entirely clear to me which instructions this covers. e.g.
> > > is "system" the entire system instruction class (i.e. all opcodes
> > > 0b110101010_0xxxxxxx_xxxxxxxx_xxxxxxxx), or did you mean something
> > > more specific? Does "store" include store-exlcusive?
> > >
> > > Other than that list, can this occur for *any* instruction? e.g.
> > > MOV, SHA256*, *DIV?
> >
> > There are two general classes of instructions. Those that only change
> > a gpr or PC. These are arithmetic, floating point, branch. Loads with
> > no side effects also fall into this category. These are the
> > instructions that can erroneously be nop'd. The other category are
> > instructions that can change architectural state more than a GPR.
> > These include all stores, atomic loads, exclusive loads, loads to
> > non-cacheable space,msr,mrs,eret,tlb*,sys,brk,etc, these does not get
> > "noped"
> >
> > >
> > > Does this only apply to a single instruction at a time, or can
> > > multiple instructions "become nop"?
> >
> > Can be multiple,
> >
> > >
> > > When an instruction "becomes nop", will subsequent instructions see
> > > a consistent architectural state (e.g. GPRs as they were exactly
> > > before the instruction which "becomes nop"), or can they see
> > > something else (e.g. garbage forwarded from register renaming or other
> internal microarchitectural state)?
> >
> > > > > * Does the CPU lockup?
> > > > No
> > > >
> > > > > * Does the CPU take the exception at all?
> > > > No
> > > >
> > > > > * Does the load behave erroneously?
> > > > No,
> > > >
> > > > > * Does any CPU state (e.g. GPRs, PC, PSTATE) become corrupted?
> > > >
> > > > yes, GPRs will get corrupted, will have stale value
> > >
> > > As above, is that the prior architectural value of the GPRs, or can
> > > that be some bogus microarchitectural state (e.g. from renaming or other
> forwarding paths)?
> >
> > The instructions that become a nop doesn't write the GPR and because
> > this is an OOO machine the GPR result isn't the prior architectural
> > value but whatever crap is leftover in the physical register.
>
> Ok, so that's a potential information leak from a different context (e.g. higher EL),
> depending on what happened to be left in that physical register.
>
> Consider a malicious guest at EL1. What prevents it from triggering this
> deliberately, then inspecting the GPRs after taking the IRQ in order to read prior
> secrets?
>
> > > > > Does the problem manifest when IRQs are masked by DAIF.I, or by
> > > > > CNT*_CTL_EL0.{IMASK,ENABLE} ?
> > > >
> > > > No, there are no issue if interrupts are masked.
> > >
> > > If a write to CNTV_CTL_EL0.IMASK races with the interrupt being
> > > asserted, can that trigger the problem?
> >
> > If interrupt is enabled (DAIF) - then it will be taken, and no issue
> > But if interrupts are disabled then following sequence can see the race
> > 1) interrupt is disabled (DAIF)
> > 2) TVAL/ENABLE/IMASK at timer h/w programming to de-assert interrupt.
> > Race of Irq asserted before irq de-asserted, than this short window of
> assertion will be considered as spike from timer h/w block
> > 3) Enable DAIF
> > Because of propagation delay CPU sees assertion and de-assertion
> > (spike), errata hit
> >
> > Will add "isb" around interrupt enablement in next version of patch.
>
> ... why?
>
> That doesn't seem to follow from the abvoe, so I think I'm missing a step.
>
> > > If a write to DAIF.I races with the interrupt being asserted, can
> > > that trigger the problem?
> >
> > No race with writing to DAIF.I with interrupt assertion, Writing
> > DAIF.I = 0 (enablement of interrupt) can race with de-assertion, which
> > can lead to hitting errata
>
> Ok.
>
> That *might* make it possible to bodge around the timer specifically, but as
> below I don't think we can ensure this is safe in the presence of virtualization,
> nor when considering other interrupts.
>
> > > From your description so far, this doesn't sound like it is specific
> > > to the timer interrupt. Is it possible for a different interrupt to trigger this, e.g:
> > >
> > > * Can the same happen with another PPI, e.g. the PMU interrupt if that
> > > gets de-asserted, or there's a race with DAIF.I?
> > >
> > > * Can the same happen with an SGI, e.g. if one CPU asserts then
> > > de-asserts an SGI targetting another CPU, or there's a race with
> > > DAIF.I?
> > >
> > > * Can the same happen with an SPI, e.g. if a device asserts then
> > > de-asserts its IRQ line, or there's a race with DAIF.I?
> >
> > No issue with edge triggered, but this can happen with any level sensitive
> interrupt.
>
> Ok.
>
> So that'll include at least the PMU and
>
> > > If not, *why* does this happen specifically for the timer interrupt?
> > >
> > > > > > Workaround of this is to ensure maximum 2us of time gap
> > > > > > between timer interrupt and timer programming which can de-assert
> timer interrupt.
> > > > >
> > > > > The code below seems to try to enforce a 2us *minimum*. Which is
> > > > > it supposed to be?
> > > >
> > > > Yes, it is minimum 2us.
> > > >
> > > > >
> > > > > Can you explain *why* this is supposed to help?
> > > > With the workaround interrupt assertion and de-assertion will be
> > > > minimum 2us
> > > apart.
> > >
> > > I understood that, but why is that deemed to be sufficient? e.g. is
> > > it somehow guaranteed that the CPU will complete the instruction that would
> "become nop"
> > > in that time?
> >
> > With this delay we avoid spike, either this this will becomes an
> > actual interrupt or the spike never visible to core.
> >
> > > > > I don't see how we can guarantee this in a VM, or if the CPU
> > > > > misses on an instruction fetch.
> > > >
> > > > This errata applies to VM (virtual timer) as well, maybe there is
> > > > some gap in my understanding, how it will be different in VM.
> > > > Can you help with what issue we can have VM?
> > >
> > > A VCPU can be pre-empted by the host at *any* time, for an arbitrary
> > > length of time. So e.g. you can have a scenario such as:
> > >
> > > 1. Guest reads CNTx_TVAL, sees interrupt is 4us in the future and
> > > decides it does not need to wait
> > > 2. Host preempts guest
> > > 3. Host does some processing for ~3.9us 4. Host returns to guest,
> > > with 0.1us left until the interrupt triggers 5. Guest reprograms
> > > CNTx_TVAL, and triggers the erratum
> >
> > Yes, when timer expire just before tval written (race condition) , so
> > there is assertion-followed by de-assertion, As interrupts are enabled
> > in host, interrupt will be visible as spike to host.
>
> Ok, so that's a recipe for the guest to attack the host.
>
> > We will apply workaround whenever entering to guest (add a delay
> > before exiting to guest in case guest timer is going to expire).
>
> I think this is papering over the problem.
>
> You've said this can happen for *any* level-triggered interrupt. AFAIK, nothing
> prevents a malicious guest from deliberately asserting and de-asserting a level-
> triggered interrupt (e.g. by writing to the GIC distributor), and I also note that the
> GIC maintenance interrupt is level-triggered.
>
> So, as above:
>
> 1) A guest can deliberately cause information to be leaked to itself via
> the corrupted GPRs. I haven't seen any rationale for why that is not
> a problem, nor have I seen a suggested workaround.
>
> 2) A guest *may* be able to trigger this while the host is running. I
> haven't seen anything that rules this out so far.
>
> 3) Even in the absence of virtualization, it would be necessary to
> workaround this for *every* level-triggered interrupt, which includes
> at the timer, PMU, and GIC maintenance interrupts, in addition to any
> other configurable PPIs or SPIs.
>
> Without a fix that covers all of those, I don't think the workaround is viable.

This patch covers workaround for ARM arch timer in non-virtualized cases.

While we are considering different scenarios which can trigger the issue.
After discussing with HW folks internally we have come to a conclusion that there is no
single workaround which will fix all the scenarios. The host timer interrupt workaround is
different from virtualization and from other interrupt sources.

While we are working on other workarounds, we want to push timer workaround first
as currently that's the one customers are encountering right now and want a upstream
accepted patch soon. Other workarounds will take time to test and qualify.

Wrt drivers disabling the interrupt, except changing the driver, we don't see any common
place where we can add a workaround. Please let me your take on this.

Thanks
-Bharat

>
> Thanks,
> Mark.

2021-07-26 18:05:47

by Marc Zyngier

[permalink] [raw]
Subject: Re: [EXT] Re: [PATCH] clocksource: Add Marvell Errata-38627 workaround

Hi Bharat,

On Mon, 26 Jul 2021 05:29:53 +0100,
Bharat Bhushan <[email protected]> wrote:
>
> Sorry for delayed response
>
> Please see inline
>
> > -----Original Message-----
> > From: Mark Rutland <[email protected]>
> > Sent: Tuesday, July 13, 2021 9:43 PM
> >
> > 1) A guest can deliberately cause information to be leaked to itself via
> > the corrupted GPRs. I haven't seen any rationale for why that is not
> > a problem, nor have I seen a suggested workaround.
> >
> > 2) A guest *may* be able to trigger this while the host is running. I
> > haven't seen anything that rules this out so far.
> >
> > 3) Even in the absence of virtualization, it would be necessary to
> > workaround this for *every* level-triggered interrupt, which includes
> > at the timer, PMU, and GIC maintenance interrupts, in addition to any
> > other configurable PPIs or SPIs.
> >
> > Without a fix that covers all of those, I don't think the
> > workaround is viable.
>
> This patch covers workaround for ARM arch timer in non-virtualized
> cases.
>
> While we are considering different scenarios which can trigger the
> issue. After discussing with HW folks internally we have come to a
> conclusion that there is no single workaround which will fix all the
> scenarios. The host timer interrupt workaround is different from
> virtualization and from other interrupt sources.
>
> While we are working on other workarounds, we want to push timer
> workaround first as currently that's the one customers are
> encountering right now and want a upstream accepted patch
> soon. Other workarounds will take time to test and qualify.
>
> Wrt drivers disabling the interrupt, except changing the driver, we
> don't see any common place where we can add a workaround. Please let
> me your take on this.

I don't think a workaround limited to the timer is viable. It is quite
obvious that once you have worked around the most likely cause for a
crash (timer interrupts), you will need to come up with yet another
workaround for another interrupt source.

We need a solution that works for all interrupts, or at the very least
all per-CPU interrupts. For global interrupts, only you can find out
how they can be mitigated. If that means changing drivers, so be it.
I understand that this isn't what you want to read, but I'm not
confident taking this patch with the knowledge that there is still a
million ways to make it fall over.

Evidently, KVM cannot be enabled on such a system. More importantly, I
cannot see how we can support users of such a machine either. How to
analyse a crash report if there is a remote possibility that the CPU
has decided to ignore a number of instructions?

To sum it up, I'm not prepared to approve such a patch until there is
a compelling story for all the interrupts that may trigger such
behaviour.

Thanks,

M.

--
Without deviation from the norm, progress is not possible.