2020-09-01 14:45:42

by Marc Zyngier

[permalink] [raw]
Subject: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Change the way we deal with GIC SGIs by turning them into proper
IRQs, and calling into the arch code to register the interrupt range
instead of a callback.

Reviewed-by: Valentin Schneider <[email protected]>
Signed-off-by: Marc Zyngier <[email protected]>
---
drivers/irqchip/irq-gic.c | 103 +++++++++++++++++++++++---------------
1 file changed, 63 insertions(+), 40 deletions(-)

diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 4ffd62af888f..4be2b62f816f 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -315,7 +315,7 @@ static int gic_set_type(struct irq_data *d, unsigned int type)
static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
{
/* Only interrupts on the primary GIC can be forwarded to a vcpu. */
- if (cascading_gic_irq(d))
+ if (cascading_gic_irq(d) || gic_irq(d) < 16)
return -EINVAL;

if (vcpu)
@@ -335,31 +335,22 @@ static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs)
irqstat = readl_relaxed(cpu_base + GIC_CPU_INTACK);
irqnr = irqstat & GICC_IAR_INT_ID_MASK;

- if (likely(irqnr > 15 && irqnr < 1020)) {
- if (static_branch_likely(&supports_deactivate_key))
- writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
- isb();
- handle_domain_irq(gic->domain, irqnr, regs);
- continue;
- }
- if (irqnr < 16) {
+ if (unlikely(irqnr >= 1020))
+ break;
+
+ if (static_branch_likely(&supports_deactivate_key))
writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
- if (static_branch_likely(&supports_deactivate_key))
- writel_relaxed(irqstat, cpu_base + GIC_CPU_DEACTIVATE);
-#ifdef CONFIG_SMP
- /*
- * Ensure any shared data written by the CPU sending
- * the IPI is read after we've read the ACK register
- * on the GIC.
- *
- * Pairs with the write barrier in gic_raise_softirq
- */
+ isb();
+
+ /*
+ * Ensure any shared data written by the CPU sending the IPI
+ * is read after we've read the ACK register on the GIC.
+ *
+ * Pairs with the write barrier in gic_ipi_send_mask
+ */
+ if (irqnr <= 15)
smp_rmb();
- handle_IPI(irqnr, regs);
-#endif
- continue;
- }
- break;
+ handle_domain_irq(gic->domain, irqnr, regs);
} while (1);
}

@@ -793,14 +784,14 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
return IRQ_SET_MASK_OK_DONE;
}

-static void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
+static void gic_ipi_send_mask(struct irq_data *d, const struct cpumask *mask)
{
int cpu;
unsigned long flags, map = 0;

if (unlikely(nr_cpu_ids == 1)) {
/* Only one CPU? let's do a self-IPI... */
- writel_relaxed(2 << 24 | irq,
+ writel_relaxed(2 << 24 | d->hwirq,
gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
return;
}
@@ -818,7 +809,7 @@ static void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
dmb(ishst);

/* this always happens on GIC0 */
- writel_relaxed(map << 16 | irq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);
+ writel_relaxed(map << 16 | d->hwirq, gic_data_dist_base(&gic_data[0]) + GIC_DIST_SOFTINT);

gic_unlock_irqrestore(flags);
}
@@ -831,14 +822,28 @@ static int gic_starting_cpu(unsigned int cpu)

static __init void gic_smp_init(void)
{
- set_smp_cross_call(gic_raise_softirq);
+ struct irq_fwspec sgi_fwspec = {
+ .fwnode = gic_data[0].domain->fwnode,
+ .param_count = 1,
+ };
+ int base_sgi;
+
cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_GIC_STARTING,
"irqchip/arm/gic:starting",
gic_starting_cpu, NULL);
+
+ base_sgi = __irq_domain_alloc_irqs(gic_data[0].domain, -1, 8,
+ NUMA_NO_NODE, &sgi_fwspec,
+ false, NULL);
+ if (WARN_ON(base_sgi <= 0))
+ return;
+
+ set_smp_ipi_range(base_sgi, 8);
}
#else
#define gic_smp_init() do { } while(0)
#define gic_set_affinity NULL
+#define gic_ipi_send_mask NULL
#endif

#ifdef CONFIG_BL_SWITCHER
@@ -985,15 +990,24 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
{
struct gic_chip_data *gic = d->host_data;

- if (hw < 32) {
+ switch (hw) {
+ case 0 ... 15:
+ irq_set_percpu_devid(irq);
+ irq_domain_set_info(d, irq, hw, &gic->chip, d->host_data,
+ handle_percpu_devid_fasteoi_ipi,
+ NULL, NULL);
+ break;
+ case 16 ... 31:
irq_set_percpu_devid(irq);
irq_domain_set_info(d, irq, hw, &gic->chip, d->host_data,
handle_percpu_devid_irq, NULL, NULL);
- } else {
+ break;
+ default:
irq_domain_set_info(d, irq, hw, &gic->chip, d->host_data,
handle_fasteoi_irq, NULL, NULL);
irq_set_probe(irq);
irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(irq)));
+ break;
}
return 0;
}
@@ -1007,19 +1021,26 @@ static int gic_irq_domain_translate(struct irq_domain *d,
unsigned long *hwirq,
unsigned int *type)
{
+ if (fwspec->param_count == 1 && fwspec->param[0] < 16) {
+ *hwirq = fwspec->param[0];
+ *type = IRQ_TYPE_EDGE_RISING;
+ return 0;
+ }
+
if (is_of_node(fwspec->fwnode)) {
if (fwspec->param_count < 3)
return -EINVAL;

- /* Get the interrupt number and add 16 to skip over SGIs */
- *hwirq = fwspec->param[1] + 16;
-
- /*
- * For SPIs, we need to add 16 more to get the GIC irq
- * ID number
- */
- if (!fwspec->param[0])
- *hwirq += 16;
+ switch (fwspec->param[0]) {
+ case 0: /* SPI */
+ *hwirq = fwspec->param[1] + 32;
+ break;
+ case 1: /* PPI */
+ *hwirq = fwspec->param[1] + 16;
+ break;
+ default:
+ return -EINVAL;
+ }

*type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;

@@ -1088,8 +1109,10 @@ static void gic_init_chip(struct gic_chip_data *gic, struct device *dev,
gic->chip.irq_set_vcpu_affinity = gic_irq_set_vcpu_affinity;
}

- if (gic == &gic_data[0])
+ if (gic == &gic_data[0]) {
gic->chip.irq_set_affinity = gic_set_affinity;
+ gic->chip.ipi_send_mask = gic_ipi_send_mask;
+ }
}

static int gic_init_bases(struct gic_chip_data *gic,
--
2.27.0


2020-09-14 13:22:08

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Marek,

On 2020-09-14 14:06, Marek Szyprowski wrote:
> Hi Marc,
>
> On 01.09.2020 16:43, Marc Zyngier wrote:
>> Change the way we deal with GIC SGIs by turning them into proper
>> IRQs, and calling into the arch code to register the interrupt range
>> instead of a callback.
>>
>> Reviewed-by: Valentin Schneider <[email protected]>
>> Signed-off-by: Marc Zyngier <[email protected]>
> This patch landed in linux next-20200914 as commit ac063232d4b0
> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it breaks
> booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
> Cortex A9 based). Here are the last lines from the bootlog:
>
> [    0.106322] CPU: Testing write buffer coherency: ok
> [    0.109895] CPU0: Spectre v2: using BPIALL workaround
> [    0.116057] CPU0: thread -1, cpu 0, socket 9, mpidr 80000900
> [    0.123885] Setting up static identity map for 0x40100000 -
> 0x40100060
> [    0.130191] rcu: Hierarchical SRCU implementation.
> [    0.137195] soc soc0: Exynos: CPU[EXYNOS4210] PRO_ID[0x43210211]
> REV[0x11] Detected
> [    0.145129] smp: Bringing up secondary CPUs ...
> [    0.156279] CPU1: thread -1, cpu 1, socket 9, mpidr 80000901
> [    0.156291] CPU1: Spectre v2: using BPIALL workaround
> [    2.716379] random: fast init done

Thanks for the report. Is this the funky non-banked GIC?

M.
--
Jazz is not dead. It just smells funny...

2020-09-14 13:38:57

by Marek Szyprowski

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Marc,

On 14.09.2020 15:13, Marc Zyngier wrote:
> On 2020-09-14 14:06, Marek Szyprowski wrote:
>> On 01.09.2020 16:43, Marc Zyngier wrote:
>>> Change the way we deal with GIC SGIs by turning them into proper
>>> IRQs, and calling into the arch code to register the interrupt range
>>> instead of a callback.
>>>
>>> Reviewed-by: Valentin Schneider <[email protected]>
>>> Signed-off-by: Marc Zyngier <[email protected]>
>> This patch landed in linux next-20200914 as commit ac063232d4b0
>> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it breaks
>> booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
>> Cortex A9 based). Here are the last lines from the bootlog:
>>
>> [    0.106322] CPU: Testing write buffer coherency: ok
>> [    0.109895] CPU0: Spectre v2: using BPIALL workaround
>> [    0.116057] CPU0: thread -1, cpu 0, socket 9, mpidr 80000900
>> [    0.123885] Setting up static identity map for 0x40100000 -
>> 0x40100060
>> [    0.130191] rcu: Hierarchical SRCU implementation.
>> [    0.137195] soc soc0: Exynos: CPU[EXYNOS4210] PRO_ID[0x43210211]
>> REV[0x11] Detected
>> [    0.145129] smp: Bringing up secondary CPUs ...
>> [    0.156279] CPU1: thread -1, cpu 1, socket 9, mpidr 80000901
>> [    0.156291] CPU1: Spectre v2: using BPIALL workaround
>> [    2.716379] random: fast init done
>
> Thanks for the report. Is this the funky non-banked GIC?

Both Exynos 4210 and 4412 use non-zero cpu-offset in GIC node in
device-tree: arch/arm/boot/dts/exynos{4210,4412}.dtsi, so I assume that
the GIC registers are not banked.

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2020-09-14 14:28:32

by Marek Szyprowski

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Marc,

On 01.09.2020 16:43, Marc Zyngier wrote:
> Change the way we deal with GIC SGIs by turning them into proper
> IRQs, and calling into the arch code to register the interrupt range
> instead of a callback.
>
> Reviewed-by: Valentin Schneider <[email protected]>
> Signed-off-by: Marc Zyngier <[email protected]>
This patch landed in linux next-20200914 as commit ac063232d4b0
("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it breaks
booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
Cortex A9 based). Here are the last lines from the bootlog:

[    0.106322] CPU: Testing write buffer coherency: ok
[    0.109895] CPU0: Spectre v2: using BPIALL workaround
[    0.116057] CPU0: thread -1, cpu 0, socket 9, mpidr 80000900
[    0.123885] Setting up static identity map for 0x40100000 - 0x40100060
[    0.130191] rcu: Hierarchical SRCU implementation.
[    0.137195] soc soc0: Exynos: CPU[EXYNOS4210] PRO_ID[0x43210211]
REV[0x11] Detected
[    0.145129] smp: Bringing up secondary CPUs ...
[    0.156279] CPU1: thread -1, cpu 1, socket 9, mpidr 80000901
[    0.156291] CPU1: Spectre v2: using BPIALL workaround
[    2.716379] random: fast init done

> ---
> drivers/irqchip/irq-gic.c | 103 +++++++++++++++++++++++---------------
> 1 file changed, 63 insertions(+), 40 deletions(-)
>
> ...

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2020-09-14 16:50:57

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Marek,

On 2020-09-14 14:26, Marek Szyprowski wrote:
> Hi Marc,
>
> On 14.09.2020 15:13, Marc Zyngier wrote:
>> On 2020-09-14 14:06, Marek Szyprowski wrote:
>>> On 01.09.2020 16:43, Marc Zyngier wrote:
>>>> Change the way we deal with GIC SGIs by turning them into proper
>>>> IRQs, and calling into the arch code to register the interrupt range
>>>> instead of a callback.
>>>>
>>>> Reviewed-by: Valentin Schneider <[email protected]>
>>>> Signed-off-by: Marc Zyngier <[email protected]>
>>> This patch landed in linux next-20200914 as commit ac063232d4b0
>>> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it
>>> breaks
>>> booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
>>> Cortex A9 based). Here are the last lines from the bootlog:
>>>
>>> [    0.106322] CPU: Testing write buffer coherency: ok
>>> [    0.109895] CPU0: Spectre v2: using BPIALL workaround
>>> [    0.116057] CPU0: thread -1, cpu 0, socket 9, mpidr 80000900
>>> [    0.123885] Setting up static identity map for 0x40100000 -
>>> 0x40100060
>>> [    0.130191] rcu: Hierarchical SRCU implementation.
>>> [    0.137195] soc soc0: Exynos: CPU[EXYNOS4210] PRO_ID[0x43210211]
>>> REV[0x11] Detected
>>> [    0.145129] smp: Bringing up secondary CPUs ...
>>> [    0.156279] CPU1: thread -1, cpu 1, socket 9, mpidr 80000901
>>> [    0.156291] CPU1: Spectre v2: using BPIALL workaround
>>> [    2.716379] random: fast init done
>>
>> Thanks for the report. Is this the funky non-banked GIC?
>
> Both Exynos 4210 and 4412 use non-zero cpu-offset in GIC node in
> device-tree: arch/arm/boot/dts/exynos{4210,4412}.dtsi, so I assume that
> the GIC registers are not banked.

Annoyingly, it seems to work correctly in QEMU:

root@unassigned-hostname:~# cat /proc/interrupts
CPU0 CPU1
40: 0 0 GIC-0 89 Level mct_comp_irq
41: 16144 0 GIC-0 74 Level mct_tick0
42: 0 15205 GIC-0 80 Level mct_tick1
43: 0 0 COMBINER 18 Edge arm-pmu
44: 0 0 COMBINER 26 Edge arm-pmu
46: 2270 0 GIC-0 107 Level mmc0
48: 878 0 GIC-0 84 Level 13800000.serial
52: 0 0 GIC-0 90 Level 13860000.i2c
54: 0 0 GIC-0 67 Level 12680000.pdma
55: 0 0 GIC-0 68 Level 12690000.pdma
56: 0 0 GIC-0 66 Level 12850000.mdma
59: 0 0 COMBINER 45 Edge 13620000.sysmmu
60: 0 0 COMBINER 46 Edge 13630000.sysmmu
61: 0 0 COMBINER 44 Edge 12e20000.sysmmu
62: 0 0 COMBINER 34 Edge 11a20000.sysmmu
63: 0 0 COMBINER 35 Edge 11a30000.sysmmu
64: 0 0 COMBINER 36 Edge 11a40000.sysmmu
65: 0 0 COMBINER 37 Edge 11a50000.sysmmu
66: 0 0 COMBINER 38 Edge 11a60000.sysmmu
67: 0 0 COMBINER 40 Edge 12a30000.sysmmu
68: 0 0 COMBINER 42 Edge 11e20000.sysmmu
74: 0 0 GIC-0 79 Level 11400000.pinctrl
75: 0 0 GIC-0 78 Level 11000000.pinctrl
77: 0 0 COMBINER 39 Edge 12a20000.sysmmu
78: 0 0 COMBINER 43 Edge 12220000.sysmmu
IPI0: 0 1 CPU wakeup interrupts
IPI1: 0 0 Timer broadcast interrupts
IPI2: 32 63 Rescheduling interrupts
IPI3: 3925 5381 Function call interrupts
IPI4: 0 0 CPU stop interrupts
IPI5: 4375 3778 IRQ work interrupts
IPI6: 0 0 completion interrupts
Err: 0

Do you happen to know whether the QEMU emulation is trustworthy?

Thanks,

M.
--
Jazz is not dead. It just smells funny...

2020-09-15 06:51:30

by Marek Szyprowski

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Marc,

On 14.09.2020 17:09, Marc Zyngier wrote:
> On 2020-09-14 14:26, Marek Szyprowski wrote:
>> On 14.09.2020 15:13, Marc Zyngier wrote:
>>> On 2020-09-14 14:06, Marek Szyprowski wrote:
>>>> On 01.09.2020 16:43, Marc Zyngier wrote:
>>>>> Change the way we deal with GIC SGIs by turning them into proper
>>>>> IRQs, and calling into the arch code to register the interrupt range
>>>>> instead of a callback.
>>>>>
>>>>> Reviewed-by: Valentin Schneider <[email protected]>
>>>>> Signed-off-by: Marc Zyngier <[email protected]>
>>>> This patch landed in linux next-20200914 as commit ac063232d4b0
>>>> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it
>>>> breaks
>>>> booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
>>>> Cortex A9 based). Here are the last lines from the bootlog:
>>>>
>>>> [    0.106322] CPU: Testing write buffer coherency: ok
>>>> [    0.109895] CPU0: Spectre v2: using BPIALL workaround
>>>> [    0.116057] CPU0: thread -1, cpu 0, socket 9, mpidr 80000900
>>>> [    0.123885] Setting up static identity map for 0x40100000 -
>>>> 0x40100060
>>>> [    0.130191] rcu: Hierarchical SRCU implementation.
>>>> [    0.137195] soc soc0: Exynos: CPU[EXYNOS4210] PRO_ID[0x43210211]
>>>> REV[0x11] Detected
>>>> [    0.145129] smp: Bringing up secondary CPUs ...
>>>> [    0.156279] CPU1: thread -1, cpu 1, socket 9, mpidr 80000901
>>>> [    0.156291] CPU1: Spectre v2: using BPIALL workaround
>>>> [    2.716379] random: fast init done
>>>
>>> Thanks for the report. Is this the funky non-banked GIC?
>>
>> Both Exynos 4210 and 4412 use non-zero cpu-offset in GIC node in
>> device-tree: arch/arm/boot/dts/exynos{4210,4412}.dtsi, so I assume that
>> the GIC registers are not banked.
>
> Annoyingly, it seems to work correctly in QEMU:
>
> root@unassigned-hostname:~# cat /proc/interrupts
>            CPU0       CPU1
>  40:          0          0     GIC-0  89 Level     mct_comp_irq
>  41:      16144          0     GIC-0  74 Level     mct_tick0
>  42:          0      15205     GIC-0  80 Level     mct_tick1
>  43:          0          0  COMBINER  18 Edge      arm-pmu
>  44:          0          0  COMBINER  26 Edge      arm-pmu
>  46:       2270          0     GIC-0 107 Level     mmc0
>  48:        878          0     GIC-0  84 Level     13800000.serial
>  52:          0          0     GIC-0  90 Level     13860000.i2c
>  54:          0          0     GIC-0  67 Level     12680000.pdma
>  55:          0          0     GIC-0  68 Level     12690000.pdma
>  56:          0          0     GIC-0  66 Level     12850000.mdma
>  59:          0          0  COMBINER  45 Edge      13620000.sysmmu
>  60:          0          0  COMBINER  46 Edge      13630000.sysmmu
>  61:          0          0  COMBINER  44 Edge      12e20000.sysmmu
>  62:          0          0  COMBINER  34 Edge      11a20000.sysmmu
>  63:          0          0  COMBINER  35 Edge      11a30000.sysmmu
>  64:          0          0  COMBINER  36 Edge      11a40000.sysmmu
>  65:          0          0  COMBINER  37 Edge      11a50000.sysmmu
>  66:          0          0  COMBINER  38 Edge      11a60000.sysmmu
>  67:          0          0  COMBINER  40 Edge      12a30000.sysmmu
>  68:          0          0  COMBINER  42 Edge      11e20000.sysmmu
>  74:          0          0     GIC-0  79 Level 11400000.pinctrl
>  75:          0          0     GIC-0  78 Level 11000000.pinctrl
>  77:          0          0  COMBINER  39 Edge      12a20000.sysmmu
>  78:          0          0  COMBINER  43 Edge      12220000.sysmmu
> IPI0:          0          1  CPU wakeup interrupts
> IPI1:          0          0  Timer broadcast interrupts
> IPI2:         32         63  Rescheduling interrupts
> IPI3:       3925       5381  Function call interrupts
> IPI4:          0          0  CPU stop interrupts
> IPI5:       4375       3778  IRQ work interrupts
> IPI6:          0          0  completion interrupts
> Err:          0
>
> Do you happen to know whether the QEMU emulation is trustworthy?

I didn't play much with Exynos emulation on QEMU. All I know is that
this patch simply doesn't work on the real hw.

If there is anything to check or test, let me know. I will try to help
as much as possible.

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2020-09-15 08:17:16

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Marek,

On 2020-09-15 07:48, Marek Szyprowski wrote:
> Hi Marc,
>
>>> Both Exynos 4210 and 4412 use non-zero cpu-offset in GIC node in
>>> device-tree: arch/arm/boot/dts/exynos{4210,4412}.dtsi, so I assume
>>> that
>>> the GIC registers are not banked.
>>
>> Annoyingly, it seems to work correctly in QEMU:

[...]

>> Do you happen to know whether the QEMU emulation is trustworthy?
>
> I didn't play much with Exynos emulation on QEMU. All I know is that
> this patch simply doesn't work on the real hw.

I don't doubt it. The question was more whether we could trust QEMU
to be reliable, in which case the issue would be around a kernel
configuration problem. Could you stash your kernel config somewhere?

> If there is anything to check or test, let me know. I will try to help
> as much as possible.

It would be interesting to see whether the CPUs are getting any IPI.
Can you try the following patch, and send the results back?

Thanks,

M.

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 00327fa74b01..5b01d53de9af 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -420,7 +420,7 @@ asmlinkage void secondary_start_kernel(void)
#ifndef CONFIG_MMU
setup_vectors_base();
#endif
- pr_debug("CPU%u: Booted secondary processor\n", cpu);
+ pr_err("CPU%u: Booted secondary processor\n", cpu);

preempt_disable();
trace_hardirqs_off();
@@ -621,6 +621,8 @@ static void do_handle_IPI(int ipinr)
{
unsigned int cpu = smp_processor_id();

+ pr_info("CPU%d IPI%d received\n", cpu, ipinr);
+
if ((unsigned)ipinr < NR_IPI)
trace_ipi_entry_rcuidle(ipi_types[ipinr]);

diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index d7321ccf730f..7723cad6e406 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -190,6 +190,8 @@ static inline bool cascading_gic_irq(struct irq_data
*d)
static void gic_poke_irq(struct irq_data *d, u32 offset)
{
u32 mask = 1 << (gic_irq(d) % 32);
+ if (gic_irq(d) < 16)
+ pr_info("CPU%d IPI%lu base = %lx\n", smp_processor_id(), d->hwirq,
(unsigned long)gic_dist_base(d));
writel_relaxed(mask, gic_dist_base(d) + offset + (gic_irq(d) / 32) *
4);
}

@@ -814,6 +816,7 @@ static void gic_ipi_send_mask(struct irq_data *d,
const struct cpumask *mask)
*/
dmb(ishst);

+ pr_info("CPU%d send IPI%lu base = %lx\n", smp_processor_id(),
d->hwirq, (unsigned long)gic_data_dist_base(&gic_data[0]));
/* this always happens on GIC0 */
writel_relaxed(map << 16 | d->hwirq, gic_data_dist_base(&gic_data[0])
+ GIC_DIST_SOFTINT);


--
Jazz is not dead. It just smells funny...

2020-09-15 08:36:39

by Marek Szyprowski

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Marc,

On 15.09.2020 10:07, Marc Zyngier wrote:
> On 2020-09-15 07:48, Marek Szyprowski wrote:
>>>> Both Exynos 4210 and 4412 use non-zero cpu-offset in GIC node in
>>>> device-tree: arch/arm/boot/dts/exynos{4210,4412}.dtsi, so I assume
>>>> that
>>>> the GIC registers are not banked.
>>>
>>> Annoyingly, it seems to work correctly in QEMU:
>
> [...]
>
>>> Do you happen to know whether the QEMU emulation is trustworthy?
>>
>> I didn't play much with Exynos emulation on QEMU. All I know is that
>> this patch simply doesn't work on the real hw.
>
> I don't doubt it. The question was more whether we could trust QEMU
> to be reliable, in which case the issue would be around a kernel
> configuration problem. Could you stash your kernel config somewhere?

I just use the vanilla exynos_defconfig for my tests.

>> If there is anything to check or test, let me know. I will try to help
>> as much as possible.
>
> It would be interesting to see whether the CPUs are getting any IPI.
> Can you try the following patch, and send the results back?

Starting kernel ...

[    0.000000] Booting Linux on physical CPU 0x900
[    0.000000] Linux version 5.9.0-rc4-00008-gac063232d4b0-dirty
(mszyprow@AMDC2765) (arm-linux-gnueabi-gcc (Linaro GCC 4.9-2017.01)
4.9.4, GNU ld (Linaro_Binutils-2017.01) 2.24.0.20141017 Linaro
2014_11-3-git) #9174 SMP PREEMPT Tue Sep 15 10:30:46 CEST 2020
[    0.000000] CPU: ARMv7 Processor [412fc091] revision 1 (ARMv7),
cr=10c5387d
[    0.000000] CPU: PIPT / VIPT nonaliasing data cache, VIPT aliasing
instruction cache
[    0.000000] OF: fdt: Machine model: Samsung Trats based on Exynos4210
[    0.000000] earlycon: exynos4210 at MMIO 0x13820000 (options '115200n8')
[    0.000000] printk: bootconsole [exynos4210] enabled
[    0.000000] Memory policy: Data cache writealloc
[    0.000000] cma: Reserved 96 MiB at 0x7a000000
[    0.000000] Samsung CPU ID: 0x43210211
[    0.000000] Zone ranges:
[    0.000000]   Normal   [mem 0x0000000040000000-0x000000006fffffff]
[    0.000000]   HighMem  [mem 0x0000000070000000-0x000000007fffffff]
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000040000000-0x000000007fffffff]
[    0.000000] Initmem setup node 0 [mem
0x0000000040000000-0x000000007fffffff]
[    0.000000] percpu: Embedded 20 pages/cpu s51904 r8192 d21824 u81920
[    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: 260608
[    0.000000] Kernel command line: root=PARTLABEL=data rootwait
console=tty1 console=ttySAC2,115200n8 earlycon
[    0.000000] Dentry cache hash table entries: 131072 (order: 7, 524288
bytes, linear)
[    0.000000] Inode-cache hash table entries: 65536 (order: 6, 262144
bytes, linear)
[    0.000000] mem auto-init: stack:off, heap alloc:off, heap free:off
[    0.000000] Memory: 917072K/1048576K available (10240K kernel code,
958K rwdata, 4000K rodata, 1024K init, 6487K bss, 33200K reserved,
98304K cma-reserved, 163840K highmem)
[    0.000000] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1
[    0.000000] Running RCU self tests
[    0.000000] rcu: Preemptible hierarchical RCU implementation.
[    0.000000] rcu:     RCU event tracing is enabled.
[    0.000000] rcu:     RCU lockdep checking is enabled.
[    0.000000] rcu:     RCU restricting CPUs from NR_CPUS=8 to nr_cpu_ids=2.
[    0.000000]  Trampoline variant of Tasks RCU enabled.
[    0.000000] rcu: RCU calculated value of scheduler-enlistment delay
is 10 jiffies.
[    0.000000] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=2
[    0.000000] NR_IRQS: 16, nr_irqs: 16, preallocated irqs: 16
[    0.000000] CPU0 IPI0 base = f0800000
[    0.000000] CPU0 IPI1 base = f0800000
[    0.000000] CPU0 IPI2 base = f0800000
[    0.000000] CPU0 IPI3 base = f0800000
[    0.000000] CPU0 IPI4 base = f0800000
[    0.000000] CPU0 IPI5 base = f0800000
[    0.000000] CPU0 IPI6 base = f0800000
[    0.000000] CPU0 IPI7 base = f0800000
[    0.000000] L2C: platform modifies aux control register: 0x02070000
-> 0x3e470000
[    0.000000] L2C: DT/platform modifies aux control register:
0x02070000 -> 0x3e470000
[    0.000000] L2C-310 enabling early BRESP for Cortex-A9
[    0.000000] L2C-310 full line of zeros enabled for Cortex-A9
[    0.000000] L2C-310 dynamic clock gating enabled, standby mode enabled
[    0.000000] L2C-310 cache controller enabled, 16 ways, 1024 kB
[    0.000000] L2C-310: CACHE_ID 0x4100c4c5, AUX_CTRL 0x4e470001
[    0.000000] random: get_random_bytes called from
start_kernel+0x4c0/0x67c with crng_init=0
[    0.000000] Exynos4210 clocks: sclk_apll = 800000000, sclk_mpll =
800000000
[    0.000000]  sclk_epll = 96000000, sclk_vpll = 108000000, arm_clk =
800000000
[    0.000000] Switching to timer-based delay loop, resolution 41ns
[    0.000000] clocksource: mct-frc: mask: 0xffffffff max_cycles:
0xffffffff, max_idle_ns: 79635851949 ns
[    0.000007] sched_clock: 32 bits at 24MHz, resolution 41ns, wraps
every 89478484971ns
[    0.008850] Console: colour dummy device 80x30
[    0.017306] printk: console [tty1] enabled
[    0.019992] Lock dependency validator: Copyright (c) 2006 Red Hat,
Inc., Ingo Molnar
[    0.027791] ... MAX_LOCKDEP_SUBCLASSES:  8
[    0.031793] ... MAX_LOCK_DEPTH:          48
[    0.035959] ... MAX_LOCKDEP_KEYS:        8192
[    0.040327] ... CLASSHASH_SIZE:          4096
[    0.044643] ... MAX_LOCKDEP_ENTRIES:     32768
[    0.049096] ... MAX_LOCKDEP_CHAINS:      65536
[    0.053497] ... CHAINHASH_SIZE:          32768
[    0.057948]  memory used by lock dependency info: 4029 kB
[    0.063318]  memory used for stack traces: 2112 kB
[    0.068110]  per task-struct memory footprint: 1536 bytes
[    0.073536] Calibrating delay loop (skipped), value calculated using
timer frequency.. 48.00 BogoMIPS (lpj=240000)
[    0.083901] pid_max: default: 32768 minimum: 301
[    0.088789] Mount-cache hash table entries: 2048 (order: 1, 8192
bytes, linear)
[    0.095733] Mountpoint-cache hash table entries: 2048 (order: 1, 8192
bytes, linear)
[    0.106706] CPU: Testing write buffer coherency: ok
[    0.110276] CPU0: Spectre v2: using BPIALL workaround
[    0.116474] CPU0: thread -1, cpu 0, socket 9, mpidr 80000900
[    0.124225] Setting up static identity map for 0x40100000 - 0x40100060
[    0.130539] rcu: Hierarchical SRCU implementation.
[    0.137562] soc soc0: Exynos: CPU[EXYNOS4210] PRO_ID[0x43210211]
REV[0x11] Detected
[    0.145493] smp: Bringing up secondary CPUs ...
[    0.152740] CPU0 send IPI0 base = f0800000
[    0.152786] CPU1: Booted secondary processor
[    0.155582] CPU0 send IPI0 base = f0800000
[    0.163945] CPU1 IPI0 base = f0808000
[    0.163956] CPU1 IPI1 base = f0808000
[    0.163966] CPU1 IPI2 base = f0808000
[    0.163976] CPU1 IPI3 base = f0808000
[    0.163986] CPU1 IPI4 base = f0808000
[    0.163995] CPU1 IPI5 base = f0808000
[    0.164004] CPU1 IPI6 base = f0808000
[    0.164014] CPU1 IPI7 base = f0808000
[    0.164025] CPU1: thread -1, cpu 1, socket 9, mpidr 80000901
[    0.164035] CPU1: Spectre v2: using BPIALL workaround
[    0.203803] CPU1 send IPI2 base = f0808000
[    0.207834] CPU1 IPI0 received
[    0.207839] CPU0 IPI2 received
[    0.214052] CPU0 send IPI2 base = f0800000
[    0.217990] CPU1 IPI2 received
[    0.222188] CPU1 send IPI2 base = f0808000
[    2.754062] random: fast init done

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2020-09-15 09:52:03

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On 2020-09-15 09:35, Marek Szyprowski wrote:
> Hi Marc,
>
> On 15.09.2020 10:07, Marc Zyngier wrote:
>> On 2020-09-15 07:48, Marek Szyprowski wrote:
>>>>> Both Exynos 4210 and 4412 use non-zero cpu-offset in GIC node in
>>>>> device-tree: arch/arm/boot/dts/exynos{4210,4412}.dtsi, so I assume
>>>>> that
>>>>> the GIC registers are not banked.
>>>>
>>>> Annoyingly, it seems to work correctly in QEMU:
>>
>> [...]
>>
>>>> Do you happen to know whether the QEMU emulation is trustworthy?
>>>
>>> I didn't play much with Exynos emulation on QEMU. All I know is that
>>> this patch simply doesn't work on the real hw.
>>
>> I don't doubt it. The question was more whether we could trust QEMU
>> to be reliable, in which case the issue would be around a kernel
>> configuration problem. Could you stash your kernel config somewhere?
>
> I just use the vanilla exynos_defconfig for my tests.

Tried that with QEMU, same result. It keeps working. Oh well.

>
>>> If there is anything to check or test, let me know. I will try to
>>> help
>>> as much as possible.
>>
>> It would be interesting to see whether the CPUs are getting any IPI.
>> Can you try the following patch, and send the results back?

[...]

> [    0.145493] smp: Bringing up secondary CPUs ...
> [    0.152740] CPU0 send IPI0 base = f0800000
> [    0.152786] CPU1: Booted secondary processor
> [    0.155582] CPU0 send IPI0 base = f0800000
> [    0.163945] CPU1 IPI0 base = f0808000
> [    0.163956] CPU1 IPI1 base = f0808000
> [    0.163966] CPU1 IPI2 base = f0808000
> [    0.163976] CPU1 IPI3 base = f0808000
> [    0.163986] CPU1 IPI4 base = f0808000
> [    0.163995] CPU1 IPI5 base = f0808000
> [    0.164004] CPU1 IPI6 base = f0808000
> [    0.164014] CPU1 IPI7 base = f0808000
> [    0.164025] CPU1: thread -1, cpu 1, socket 9, mpidr 80000901
> [    0.164035] CPU1: Spectre v2: using BPIALL workaround
> [    0.203803] CPU1 send IPI2 base = f0808000
> [    0.207834] CPU1 IPI0 received
> [    0.207839] CPU0 IPI2 received
> [    0.214052] CPU0 send IPI2 base = f0800000
> [    0.217990] CPU1 IPI2 received
> [    0.222188] CPU1 send IPI2 base = f0808000
> [    2.754062] random: fast init done

So IPIs *do work* for some time, but CPU0 ends up not seeing IPI2.
I see a slightly different behaviour in QEMU:

[ 0.555590] smp: Bringing up secondary CPUs ...
[ 0.606032] CPU0 send IPI0 base = f0800000
[ 0.609149] CPU0 send IPI0 base = f0800000
[ 0.610329] CPU0 send IPI0 base = f0800000
[ 0.611445] CPU0 send IPI0 base = f0800000
[ 0.611588] CPU1: Booted secondary processor
[ 0.613579] CPU0 send IPI0 base = f0800000
[ 0.616180] CPU1 IPI0 base = f0808000
[ 0.616470] CPU1 IPI1 base = f0808000
[ 0.616634] CPU1 IPI2 base = f0808000
[ 0.616781] CPU1 IPI3 base = f0808000
[ 0.616931] CPU1 IPI4 base = f0808000
[ 0.617074] CPU1 IPI5 base = f0808000
[ 0.617220] CPU1 IPI6 base = f0808000
[ 0.617366] CPU1 IPI7 base = f0808000
[ 0.617824] CPU1: thread -1, cpu 1, socket 9, mpidr 80000901
[ 0.618115] CPU1: Spectre v2: using BPIALL workaround
[ 0.627969] CPU1 send IPI3 base = f0808000
[ 0.631301] CPU0 IPI3 received
[ 0.631389] CPU1 IPI0 received
[ 0.639726] CPU0 send IPI2 base = f0800000
[ 0.641632] CPU1 IPI2 received
[ 0.664666] CPU1 send IPI2 base = f0808000
[ 0.665987] CPU0 IPI2 received
[ 0.670718] smp: Brought up 1 node, 2 CPUs
[ 0.672175] SMP: Total of 2 processors activated (48.00 BogoMIPS).
[ 0.674071] CPU: All CPU(s) started in SVC mode.

where the secondary starts by sending IPI3 (IPI_CALL_FUNC). Not sure it
matters.

The fact that CPU0 doesn't process the second IPI2 makes me wonder
if there is something flawed in the EOI logic.

Can you try applying this patch, which reverts that particular logic?
If that happens to work, we'll have to investigate what comes out
of the IAR register...

Otherwise, we'll keep reverting bits of the patch until we nail it...

Thanks,

M.

diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 4be2b62f816f..6daf2de7233a 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -335,22 +335,31 @@ static void __exception_irq_entry
gic_handle_irq(struct pt_regs *regs)
irqstat = readl_relaxed(cpu_base + GIC_CPU_INTACK);
irqnr = irqstat & GICC_IAR_INT_ID_MASK;

- if (unlikely(irqnr >= 1020))
- break;
-
- if (static_branch_likely(&supports_deactivate_key))
+ if (likely(irqnr > 15 && irqnr < 1020)) {
+ if (static_branch_likely(&supports_deactivate_key))
+ writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
+ isb();
+ handle_domain_irq(gic->domain, irqnr, regs);
+ continue;
+ }
+ if (irqnr < 16) {
writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
- isb();
-
- /*
- * Ensure any shared data written by the CPU sending the IPI
- * is read after we've read the ACK register on the GIC.
- *
- * Pairs with the write barrier in gic_ipi_send_mask
- */
- if (irqnr <= 15)
+ if (static_branch_likely(&supports_deactivate_key))
+ writel_relaxed(irqstat, cpu_base + GIC_CPU_DEACTIVATE);
+#ifdef CONFIG_SMP
+ /*
+ * Ensure any shared data written by the CPU sending
+ * the IPI is read after we've read the ACK register
+ * on the GIC.
+ *
+ * Pairs with the write barrier in gic_raise_softirq
+ */
smp_rmb();
- handle_domain_irq(gic->domain, irqnr, regs);
+ handle_IPI(irqnr, regs);
+#endif
+ continue;
+ }
+ break;
} while (1);
}


--
Jazz is not dead. It just smells funny...

2020-09-16 16:01:04

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On 2020-09-16 16:46, Jon Hunter wrote:
> On 16/09/2020 16:10, Marc Zyngier wrote:
>> Hi Jon,
>>
>> +Linus, who is facing a similar issue.
>>
>> On 2020-09-16 15:16, Jon Hunter wrote:
>>> Hi Marc,
>>>
>>> On 14/09/2020 14:06, Marek Szyprowski wrote:
>>>> Hi Marc,
>>>>
>>>> On 01.09.2020 16:43, Marc Zyngier wrote:
>>>>> Change the way we deal with GIC SGIs by turning them into proper
>>>>> IRQs, and calling into the arch code to register the interrupt
>>>>> range
>>>>> instead of a callback.
>>>>>
>>>>> Reviewed-by: Valentin Schneider <[email protected]>
>>>>> Signed-off-by: Marc Zyngier <[email protected]>
>>>> This patch landed in linux next-20200914 as commit ac063232d4b0
>>>> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it
>>>> breaks
>>>> booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
>>>> Cortex A9 based). Here are the last lines from the bootlog:
>>>
>>> I am observing the same thing on several Tegra boards (both arm and
>>> arm64). Bisect is pointing to this commit. Reverting this alone does
>>> not
>>> appear to be enough to fix the issue.
>>
>> Right, I am just massively by the GICv3 spec, and failed to remember
>> that ye olde GIC exposes the source CPU in AIR *and* wants it back,
>> while
>> newer GICs deal with that transparently.
>>
>> Can you try the patch below and let me know?
>
> Yes will do.
>
>> @@ -365,14 +354,13 @@ static void __exception_irq_entry
>> gic_handle_irq(struct pt_regs *regs)
>>              smp_rmb();
>>
>>              /*
>> -             * Samsung's funky GIC encodes the source CPU in
>> -             * GICC_IAR, leading to the deactivation to fail if
>> -             * not written back as is to GICC_EOI.  Stash the
>> -             * INTID away for gic_eoi_irq() to write back.
>> -             * This only works because we don't nest SGIs...
>> +             * The GIC encodes the source CPU in GICC_IAR,
>> +             * leading to the deactivation to fail if not
>> +             * written back as is to GICC_EOI.  Stash the INTID
>> +             * away for gic_eoi_irq() to write back.  This only
>> +             * works because we don't nest SGIs...
>>               */
>> -            if (is_frankengic())
>> -                set_sgi_intid(irqstat);
>> +            this_cpu_write(sgi_intid, intid);
>
> I assume that it should be irqstat here and not intid?

Indeed. As you can tell, I haven't even tried to compile it, sorry about
that.

M.
--
Jazz is not dead. It just smells funny...

2020-09-16 16:43:25

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts



On 16/09/2020 16:10, Marc Zyngier wrote:
> Hi Jon,
>
> +Linus, who is facing a similar issue.
>
> On 2020-09-16 15:16, Jon Hunter wrote:
>> Hi Marc,
>>
>> On 14/09/2020 14:06, Marek Szyprowski wrote:
>>> Hi Marc,
>>>
>>> On 01.09.2020 16:43, Marc Zyngier wrote:
>>>> Change the way we deal with GIC SGIs by turning them into proper
>>>> IRQs, and calling into the arch code to register the interrupt range
>>>> instead of a callback.
>>>>
>>>> Reviewed-by: Valentin Schneider <[email protected]>
>>>> Signed-off-by: Marc Zyngier <[email protected]>
>>> This patch landed in linux next-20200914 as commit ac063232d4b0
>>> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it breaks
>>> booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
>>> Cortex A9 based). Here are the last lines from the bootlog:
>>
>> I am observing the same thing on several Tegra boards (both arm and
>> arm64). Bisect is pointing to this commit. Reverting this alone does not
>> appear to be enough to fix the issue.
>
> Right, I am just massively by the GICv3 spec, and failed to remember
> that ye olde GIC exposes the source CPU in AIR *and* wants it back, while
> newer GICs deal with that transparently.
>
> Can you try the patch below and let me know?

Yes will do.

> @@ -365,14 +354,13 @@ static void __exception_irq_entry
> gic_handle_irq(struct pt_regs *regs)
>              smp_rmb();
>
>              /*
> -             * Samsung's funky GIC encodes the source CPU in
> -             * GICC_IAR, leading to the deactivation to fail if
> -             * not written back as is to GICC_EOI.  Stash the
> -             * INTID away for gic_eoi_irq() to write back.
> -             * This only works because we don't nest SGIs...
> +             * The GIC encodes the source CPU in GICC_IAR,
> +             * leading to the deactivation to fail if not
> +             * written back as is to GICC_EOI.  Stash the INTID
> +             * away for gic_eoi_irq() to write back.  This only
> +             * works because we don't nest SGIs...
>               */
> -            if (is_frankengic())
> -                set_sgi_intid(irqstat);
> +            this_cpu_write(sgi_intid, intid);

I assume that it should be irqstat here and not intid?

Cheers
Jon

--
nvpublic

2020-09-16 16:46:21

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Marc,

On 14/09/2020 14:06, Marek Szyprowski wrote:
> Hi Marc,
>
> On 01.09.2020 16:43, Marc Zyngier wrote:
>> Change the way we deal with GIC SGIs by turning them into proper
>> IRQs, and calling into the arch code to register the interrupt range
>> instead of a callback.
>>
>> Reviewed-by: Valentin Schneider <[email protected]>
>> Signed-off-by: Marc Zyngier <[email protected]>
> This patch landed in linux next-20200914 as commit ac063232d4b0
> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it breaks
> booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
> Cortex A9 based). Here are the last lines from the bootlog:

I am observing the same thing on several Tegra boards (both arm and
arm64). Bisect is pointing to this commit. Reverting this alone does not
appear to be enough to fix the issue.

Cheers
Jon

--
nvpublic

2020-09-16 19:08:09

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On 16/09/2020 17:22, Marc Zyngier wrote:
> Do you boot form EL2?

Not that I am aware of. There is no hypervisor that we are using.

Jon

--
nvpublic

2020-09-16 19:09:32

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts


On 16/09/2020 17:28, Marc Zyngier wrote:

...

> Make it that instead:
>
>  static void gic_eoimode1_eoi_irq(struct irq_data *d)
>  {
> +    u32 hwirq = gic_irq(d);
> +
>      /* Do not deactivate an IRQ forwarded to a vcpu. */
>      if (irqd_is_forwarded_to_vcpu(d))
>          return;
>
> -    writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_DEACTIVATE);
> +    if (hwirq < 16)
> +        hwirq = this_cpu_read(sgi_intid);
> +
> +    writel_relaxed(hwirq, gic_cpu_base(d) + GIC_CPU_DEACTIVATE);
>  }


Unfortunately, still does not boot :-(

Jon

--
nvpublic

2020-09-16 19:13:00

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Jon,

+Linus, who is facing a similar issue.

On 2020-09-16 15:16, Jon Hunter wrote:
> Hi Marc,
>
> On 14/09/2020 14:06, Marek Szyprowski wrote:
>> Hi Marc,
>>
>> On 01.09.2020 16:43, Marc Zyngier wrote:
>>> Change the way we deal with GIC SGIs by turning them into proper
>>> IRQs, and calling into the arch code to register the interrupt range
>>> instead of a callback.
>>>
>>> Reviewed-by: Valentin Schneider <[email protected]>
>>> Signed-off-by: Marc Zyngier <[email protected]>
>> This patch landed in linux next-20200914 as commit ac063232d4b0
>> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it
>> breaks
>> booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
>> Cortex A9 based). Here are the last lines from the bootlog:
>
> I am observing the same thing on several Tegra boards (both arm and
> arm64). Bisect is pointing to this commit. Reverting this alone does
> not
> appear to be enough to fix the issue.

Right, I am just massively by the GICv3 spec, and failed to remember
that ye olde GIC exposes the source CPU in AIR *and* wants it back,
while
newer GICs deal with that transparently.

Can you try the patch below and let me know?

M.

diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index 98743afdaea6..56492bf8b6f9 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -121,9 +121,10 @@ static struct gic_chip_data
gic_data[CONFIG_ARM_GIC_MAX_NR] __read_mostly;

static struct gic_kvm_info gic_v2_kvm_info;

+static DEFINE_PER_CPU(u32, sgi_intid);
+
#ifdef CONFIG_GIC_NON_BANKED
static DEFINE_STATIC_KEY_FALSE(frankengic_key);
-static DEFINE_PER_CPU(u32, sgi_intid);

static void enable_frankengic(void)
{
@@ -135,16 +136,6 @@ static inline bool is_frankengic(void)
return static_branch_unlikely(&frankengic_key);
}

-static inline void set_sgi_intid(u32 intid)
-{
- this_cpu_write(sgi_intid, intid);
-}
-
-static inline u32 get_sgi_intid(void)
-{
- return this_cpu_read(sgi_intid);
-}
-
static inline void __iomem *__get_base(union gic_base *base)
{
if (is_frankengic())
@@ -160,8 +151,6 @@ static inline void __iomem *__get_base(union
gic_base *base)
#define gic_data_cpu_base(d) ((d)->cpu_base.common_base)
#define enable_frankengic() do { } while(0)
#define is_frankengic() false
-#define set_sgi_intid(i) do { } while(0)
-#define get_sgi_intid() 0
#endif

static inline void __iomem *gic_dist_base(struct irq_data *d)
@@ -236,8 +225,8 @@ static void gic_eoi_irq(struct irq_data *d)
{
u32 hwirq = gic_irq(d);

- if (is_frankengic() && hwirq < 16)
- hwirq = get_sgi_intid();
+ if (hwirq < 16)
+ hwirq = this_cpu_read(sgi_intid);

writel_relaxed(hwirq, gic_cpu_base(d) + GIC_CPU_EOI);
}
@@ -365,14 +354,13 @@ static void __exception_irq_entry
gic_handle_irq(struct pt_regs *regs)
smp_rmb();

/*
- * Samsung's funky GIC encodes the source CPU in
- * GICC_IAR, leading to the deactivation to fail if
- * not written back as is to GICC_EOI. Stash the
- * INTID away for gic_eoi_irq() to write back.
- * This only works because we don't nest SGIs...
+ * The GIC encodes the source CPU in GICC_IAR,
+ * leading to the deactivation to fail if not
+ * written back as is to GICC_EOI. Stash the INTID
+ * away for gic_eoi_irq() to write back. This only
+ * works because we don't nest SGIs...
*/
- if (is_frankengic())
- set_sgi_intid(irqstat);
+ this_cpu_write(sgi_intid, intid);
}

handle_domain_irq(gic->domain, irqnr, regs);

--
Jazz is not dead. It just smells funny...

2020-09-16 19:43:16

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts


On 16/09/2020 20:26, Mikko Perttunen wrote:
> Not sure which boards this issue is happening on, but looking at my
> hobby kernel's git history (from a couple of years ago, memory is a bit
> hazy), the commit labeled "Add support for TX2" adds code to drop from
> EL2 to EL1 at boot.

I am seeing boot issues on Tegra20, Tegra30, Tegra186 and Tegra194.
Interestingly, Tegra124 and Tegra210 are booting OK.

Jon

--
nvpublic

2020-09-16 20:13:17

by Mikko Perttunen

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Not sure which boards this issue is happening on, but looking at my
hobby kernel's git history (from a couple of years ago, memory is a bit
hazy), the commit labeled "Add support for TX2" adds code to drop from
EL2 to EL1 at boot.

Mikko

On 9/16/20 10:06 PM, Jon Hunter wrote:
> On 16/09/2020 17:22, Marc Zyngier wrote:
>> Do you boot form EL2?
>
> Not that I am aware of. There is no hypervisor that we are using.
>
> Jon
>

2020-09-16 20:28:43

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts


On 16/09/2020 16:55, Marc Zyngier wrote:
> On 2020-09-16 16:46, Jon Hunter wrote:
>> On 16/09/2020 16:10, Marc Zyngier wrote:
>>> Hi Jon,
>>>
>>> +Linus, who is facing a similar issue.
>>>
>>> On 2020-09-16 15:16, Jon Hunter wrote:
>>>> Hi Marc,
>>>>
>>>> On 14/09/2020 14:06, Marek Szyprowski wrote:
>>>>> Hi Marc,
>>>>>
>>>>> On 01.09.2020 16:43, Marc Zyngier wrote:
>>>>>> Change the way we deal with GIC SGIs by turning them into proper
>>>>>> IRQs, and calling into the arch code to register the interrupt range
>>>>>> instead of a callback.
>>>>>>
>>>>>> Reviewed-by: Valentin Schneider <[email protected]>
>>>>>> Signed-off-by: Marc Zyngier <[email protected]>
>>>>> This patch landed in linux next-20200914 as commit ac063232d4b0
>>>>> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it
>>>>> breaks
>>>>> booting of all Samsung Exynos 4210/4412 based boards (dual/quad ARM
>>>>> Cortex A9 based). Here are the last lines from the bootlog:
>>>>
>>>> I am observing the same thing on several Tegra boards (both arm and
>>>> arm64). Bisect is pointing to this commit. Reverting this alone does
>>>> not
>>>> appear to be enough to fix the issue.
>>>
>>> Right, I am just massively by the GICv3 spec, and failed to remember
>>> that ye olde GIC exposes the source CPU in AIR *and* wants it back,
>>> while
>>> newer GICs deal with that transparently.
>>>
>>> Can you try the patch below and let me know?
>>
>> Yes will do.
>>
>>> @@ -365,14 +354,13 @@ static void __exception_irq_entry
>>> gic_handle_irq(struct pt_regs *regs)
>>>              smp_rmb();
>>>
>>>              /*
>>> -             * Samsung's funky GIC encodes the source CPU in
>>> -             * GICC_IAR, leading to the deactivation to fail if
>>> -             * not written back as is to GICC_EOI.  Stash the
>>> -             * INTID away for gic_eoi_irq() to write back.
>>> -             * This only works because we don't nest SGIs...
>>> +             * The GIC encodes the source CPU in GICC_IAR,
>>> +             * leading to the deactivation to fail if not
>>> +             * written back as is to GICC_EOI.  Stash the INTID
>>> +             * away for gic_eoi_irq() to write back.  This only
>>> +             * works because we don't nest SGIs...
>>>               */
>>> -            if (is_frankengic())
>>> -                set_sgi_intid(irqstat);
>>> +            this_cpu_write(sgi_intid, intid);
>>
>> I assume that it should be irqstat here and not intid?
>
> Indeed. As you can tell, I haven't even tried to compile it, sorry about
> that.

No worries, I got the gist. However, even with this change, it still
does not boot :-(

Jon

--
nvpublic

2020-09-16 20:45:33

by Linus Walleij

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On Tue, Sep 1, 2020 at 4:44 PM Marc Zyngier <[email protected]> wrote:

> Change the way we deal with GIC SGIs by turning them into proper
> IRQs, and calling into the arch code to register the interrupt range
> instead of a callback.
>
> Reviewed-by: Valentin Schneider <[email protected]>
> Signed-off-by: Marc Zyngier <[email protected]>

Hmmm apart from Exynos this crashes the Ux500 too... I don't
even get the crash dumpon a LL UART, it hard hangs.

Yours,
Linus Walleij

2020-09-16 20:48:01

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On 2020-09-16 16:58, Jon Hunter wrote:
> On 16/09/2020 16:55, Marc Zyngier wrote:
>> On 2020-09-16 16:46, Jon Hunter wrote:
>>> On 16/09/2020 16:10, Marc Zyngier wrote:
>>>> Hi Jon,
>>>>
>>>> +Linus, who is facing a similar issue.
>>>>
>>>> On 2020-09-16 15:16, Jon Hunter wrote:
>>>>> Hi Marc,
>>>>>
>>>>> On 14/09/2020 14:06, Marek Szyprowski wrote:
>>>>>> Hi Marc,
>>>>>>
>>>>>> On 01.09.2020 16:43, Marc Zyngier wrote:
>>>>>>> Change the way we deal with GIC SGIs by turning them into proper
>>>>>>> IRQs, and calling into the arch code to register the interrupt
>>>>>>> range
>>>>>>> instead of a callback.
>>>>>>>
>>>>>>> Reviewed-by: Valentin Schneider <[email protected]>
>>>>>>> Signed-off-by: Marc Zyngier <[email protected]>
>>>>>> This patch landed in linux next-20200914 as commit ac063232d4b0
>>>>>> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it
>>>>>> breaks
>>>>>> booting of all Samsung Exynos 4210/4412 based boards (dual/quad
>>>>>> ARM
>>>>>> Cortex A9 based). Here are the last lines from the bootlog:
>>>>>
>>>>> I am observing the same thing on several Tegra boards (both arm and
>>>>> arm64). Bisect is pointing to this commit. Reverting this alone
>>>>> does
>>>>> not
>>>>> appear to be enough to fix the issue.
>>>>
>>>> Right, I am just massively by the GICv3 spec, and failed to remember
>>>> that ye olde GIC exposes the source CPU in AIR *and* wants it back,
>>>> while
>>>> newer GICs deal with that transparently.
>>>>
>>>> Can you try the patch below and let me know?
>>>
>>> Yes will do.
>>>
>>>> @@ -365,14 +354,13 @@ static void __exception_irq_entry
>>>> gic_handle_irq(struct pt_regs *regs)
>>>>              smp_rmb();
>>>>
>>>>              /*
>>>> -             * Samsung's funky GIC encodes the source CPU in
>>>> -             * GICC_IAR, leading to the deactivation to fail if
>>>> -             * not written back as is to GICC_EOI.  Stash the
>>>> -             * INTID away for gic_eoi_irq() to write back.
>>>> -             * This only works because we don't nest SGIs...
>>>> +             * The GIC encodes the source CPU in GICC_IAR,
>>>> +             * leading to the deactivation to fail if not
>>>> +             * written back as is to GICC_EOI.  Stash the INTID
>>>> +             * away for gic_eoi_irq() to write back.  This only
>>>> +             * works because we don't nest SGIs...
>>>>               */
>>>> -            if (is_frankengic())
>>>> -                set_sgi_intid(irqstat);
>>>> +            this_cpu_write(sgi_intid, intid);
>>>
>>> I assume that it should be irqstat here and not intid?
>>
>> Indeed. As you can tell, I haven't even tried to compile it, sorry
>> about
>> that.
>
> No worries, I got the gist. However, even with this change, it still
> does not boot :-(

Do you boot form EL2? If so, you'd also need this:

static void gic_eoimode1_eoi_irq(struct irq_data *d)
{
+ u32 hwirq = gic_irq(d);
+
/* Do not deactivate an IRQ forwarded to a vcpu. */
if (irqd_is_forwarded_to_vcpu(d))
return;

+ if (hwirq < 16)
+ hwirq = this_cpu_read(sgi_intid);
+
writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_DEACTIVATE);
}

If none of that works, we'll need some additional traces. On the other
hand, I just booted this on a GICv2-based system, and it worked fine...

M.
--
Jazz is not dead. It just smells funny...

2020-09-16 20:48:24

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On 2020-09-16 17:22, Marc Zyngier wrote:
> On 2020-09-16 16:58, Jon Hunter wrote:
>> On 16/09/2020 16:55, Marc Zyngier wrote:
>>> On 2020-09-16 16:46, Jon Hunter wrote:
>>>> On 16/09/2020 16:10, Marc Zyngier wrote:
>>>>> Hi Jon,
>>>>>
>>>>> +Linus, who is facing a similar issue.
>>>>>
>>>>> On 2020-09-16 15:16, Jon Hunter wrote:
>>>>>> Hi Marc,
>>>>>>
>>>>>> On 14/09/2020 14:06, Marek Szyprowski wrote:
>>>>>>> Hi Marc,
>>>>>>>
>>>>>>> On 01.09.2020 16:43, Marc Zyngier wrote:
>>>>>>>> Change the way we deal with GIC SGIs by turning them into proper
>>>>>>>> IRQs, and calling into the arch code to register the interrupt
>>>>>>>> range
>>>>>>>> instead of a callback.
>>>>>>>>
>>>>>>>> Reviewed-by: Valentin Schneider <[email protected]>
>>>>>>>> Signed-off-by: Marc Zyngier <[email protected]>
>>>>>>> This patch landed in linux next-20200914 as commit ac063232d4b0
>>>>>>> ("irqchip/gic: Configure SGIs as standard interrupts"). Sadly it
>>>>>>> breaks
>>>>>>> booting of all Samsung Exynos 4210/4412 based boards (dual/quad
>>>>>>> ARM
>>>>>>> Cortex A9 based). Here are the last lines from the bootlog:
>>>>>>
>>>>>> I am observing the same thing on several Tegra boards (both arm
>>>>>> and
>>>>>> arm64). Bisect is pointing to this commit. Reverting this alone
>>>>>> does
>>>>>> not
>>>>>> appear to be enough to fix the issue.
>>>>>
>>>>> Right, I am just massively by the GICv3 spec, and failed to
>>>>> remember
>>>>> that ye olde GIC exposes the source CPU in AIR *and* wants it back,
>>>>> while
>>>>> newer GICs deal with that transparently.
>>>>>
>>>>> Can you try the patch below and let me know?
>>>>
>>>> Yes will do.
>>>>
>>>>> @@ -365,14 +354,13 @@ static void __exception_irq_entry
>>>>> gic_handle_irq(struct pt_regs *regs)
>>>>>              smp_rmb();
>>>>>
>>>>>              /*
>>>>> -             * Samsung's funky GIC encodes the source CPU in
>>>>> -             * GICC_IAR, leading to the deactivation to fail if
>>>>> -             * not written back as is to GICC_EOI.  Stash the
>>>>> -             * INTID away for gic_eoi_irq() to write back.
>>>>> -             * This only works because we don't nest SGIs...
>>>>> +             * The GIC encodes the source CPU in GICC_IAR,
>>>>> +             * leading to the deactivation to fail if not
>>>>> +             * written back as is to GICC_EOI.  Stash the INTID
>>>>> +             * away for gic_eoi_irq() to write back.  This only
>>>>> +             * works because we don't nest SGIs...
>>>>>               */
>>>>> -            if (is_frankengic())
>>>>> -                set_sgi_intid(irqstat);
>>>>> +            this_cpu_write(sgi_intid, intid);
>>>>
>>>> I assume that it should be irqstat here and not intid?
>>>
>>> Indeed. As you can tell, I haven't even tried to compile it, sorry
>>> about
>>> that.
>>
>> No worries, I got the gist. However, even with this change, it still
>> does not boot :-(
>
> Do you boot form EL2? If so, you'd also need this:
>
> static void gic_eoimode1_eoi_irq(struct irq_data *d)
> {
> + u32 hwirq = gic_irq(d);
> +
> /* Do not deactivate an IRQ forwarded to a vcpu. */
> if (irqd_is_forwarded_to_vcpu(d))
> return;
>
> + if (hwirq < 16)
> + hwirq = this_cpu_read(sgi_intid);
> +
> writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_DEACTIVATE);
> }

Make it that instead:

static void gic_eoimode1_eoi_irq(struct irq_data *d)
{
+ u32 hwirq = gic_irq(d);
+
/* Do not deactivate an IRQ forwarded to a vcpu. */
if (irqd_is_forwarded_to_vcpu(d))
return;

- writel_relaxed(gic_irq(d), gic_cpu_base(d) + GIC_CPU_DEACTIVATE);
+ if (hwirq < 16)
+ hwirq = this_cpu_read(sgi_intid);
+
+ writel_relaxed(hwirq, gic_cpu_base(d) + GIC_CPU_DEACTIVATE);
}


M.
--
Jazz is not dead. It just smells funny...

2020-09-16 21:02:01

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Linus,

On 2020-09-16 15:03, Linus Walleij wrote:
> On Tue, Sep 1, 2020 at 4:44 PM Marc Zyngier <[email protected]> wrote:
>
>> Change the way we deal with GIC SGIs by turning them into proper
>> IRQs, and calling into the arch code to register the interrupt range
>> instead of a callback.
>>
>> Reviewed-by: Valentin Schneider <[email protected]>
>> Signed-off-by: Marc Zyngier <[email protected]>
>
> Hmmm apart from Exynos this crashes the Ux500 too... I don't
> even get the crash dumpon a LL UART, it hard hangs.

No output whatsoever? That's really odd... Or do you see the kernel
booting and locking up when starting secondaries? If the latter, it
would indicate that we haven't properly deactivated the SGI...

M.
--
Jazz is not dead. It just smells funny...

2020-09-17 07:44:46

by Linus Walleij

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On Wed, Sep 16, 2020 at 5:11 PM Marc Zyngier <[email protected]> wrote:

> Can you try the patch below and let me know?

I tried this patch and now Ux500 WORKS. So this patch is definitely
something you should apply.

> - if (is_frankengic())
> - set_sgi_intid(irqstat);
> + this_cpu_write(sgi_intid, intid);

This needs changing to irqstat to compile as pointed out by Jon.

With that:
Tested-by: Linus Walleij <[email protected]>

Yours,
Linus Walleij

2020-09-17 07:55:50

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Linus,

On 2020-09-17 08:40, Linus Walleij wrote:
> On Wed, Sep 16, 2020 at 5:11 PM Marc Zyngier <[email protected]> wrote:
>
>> Can you try the patch below and let me know?
>
> I tried this patch and now Ux500 WORKS. So this patch is definitely
> something you should apply.
>
>> - if (is_frankengic())
>> - set_sgi_intid(irqstat);
>> + this_cpu_write(sgi_intid, intid);
>
> This needs changing to irqstat to compile as pointed out by Jon.
>
> With that:
> Tested-by: Linus Walleij <[email protected]>

Thanks a lot for that.

Still need to understand why some of Jon's systems are left unbootable,
despite having similar GIC implementations (Tegra194 and Tegra210 use
the same GIC-400, and yet only one of the two boots correctly...).

Thanks again,

M.
--
Jazz is not dead. It just smells funny...

2020-09-17 08:04:40

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts


On 17/09/2020 08:50, Marc Zyngier wrote:
> Hi Linus,
>
> On 2020-09-17 08:40, Linus Walleij wrote:
>> On Wed, Sep 16, 2020 at 5:11 PM Marc Zyngier <[email protected]> wrote:
>>
>>> Can you try the patch below and let me know?
>>
>> I tried this patch and now Ux500 WORKS. So this patch is definitely
>> something you should apply.
>>
>>> -                       if (is_frankengic())
>>> -                               set_sgi_intid(irqstat);
>>> +                       this_cpu_write(sgi_intid, intid);
>>
>> This needs changing to irqstat to compile as pointed out by Jon.
>>
>> With that:
>> Tested-by: Linus Walleij <[email protected]>
>
> Thanks a lot for that.
>
> Still need to understand why some of Jon's systems are left unbootable,
> despite having similar GIC implementations (Tegra194 and Tegra210 use
> the same GIC-400, and yet only one of the two boots correctly...).

So far, I have only tested this patch on Tegra20. Let me try the other
failing boards this morning and see if those still fail.

Cheers
Jon

--
nvpublic

2020-09-17 08:49:01

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On 2020-09-17 08:54, Jon Hunter wrote:
> On 17/09/2020 08:50, Marc Zyngier wrote:
>> Hi Linus,
>>
>> On 2020-09-17 08:40, Linus Walleij wrote:
>>> On Wed, Sep 16, 2020 at 5:11 PM Marc Zyngier <[email protected]> wrote:
>>>
>>>> Can you try the patch below and let me know?
>>>
>>> I tried this patch and now Ux500 WORKS. So this patch is definitely
>>> something you should apply.
>>>
>>>> -                       if (is_frankengic())
>>>> -                               set_sgi_intid(irqstat);
>>>> +                       this_cpu_write(sgi_intid, intid);
>>>
>>> This needs changing to irqstat to compile as pointed out by Jon.
>>>
>>> With that:
>>> Tested-by: Linus Walleij <[email protected]>
>>
>> Thanks a lot for that.
>>
>> Still need to understand why some of Jon's systems are left
>> unbootable,
>> despite having similar GIC implementations (Tegra194 and Tegra210 use
>> the same GIC-400, and yet only one of the two boots correctly...).
>
> So far, I have only tested this patch on Tegra20. Let me try the other
> failing boards this morning and see if those still fail.

Tegra20 (if I remember well) is a dual A9 with the same GIC
implementation
as Ux500, hence requiring the source CPU bits to be written back. So
this
patch should have cured it, but didn't...

/me puzzled.

M.
--
Jazz is not dead. It just smells funny...

2020-09-17 08:53:34

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts


On 17/09/2020 09:45, Marc Zyngier wrote:
> On 2020-09-17 08:54, Jon Hunter wrote:
>> On 17/09/2020 08:50, Marc Zyngier wrote:
>>> Hi Linus,
>>>
>>> On 2020-09-17 08:40, Linus Walleij wrote:
>>>> On Wed, Sep 16, 2020 at 5:11 PM Marc Zyngier <[email protected]> wrote:
>>>>
>>>>> Can you try the patch below and let me know?
>>>>
>>>> I tried this patch and now Ux500 WORKS. So this patch is definitely
>>>> something you should apply.
>>>>
>>>>> -                       if (is_frankengic())
>>>>> -                               set_sgi_intid(irqstat);
>>>>> +                       this_cpu_write(sgi_intid, intid);
>>>>
>>>> This needs changing to irqstat to compile as pointed out by Jon.
>>>>
>>>> With that:
>>>> Tested-by: Linus Walleij <[email protected]>
>>>
>>> Thanks a lot for that.
>>>
>>> Still need to understand why some of Jon's systems are left unbootable,
>>> despite having similar GIC implementations (Tegra194 and Tegra210 use
>>> the same GIC-400, and yet only one of the two boots correctly...).
>>
>> So far, I have only tested this patch on Tegra20. Let me try the other
>> failing boards this morning and see if those still fail.
>
> Tegra20 (if I remember well) is a dual A9 with the same GIC implementation
> as Ux500, hence requiring the source CPU bits to be written back. So this
> patch should have cured it, but didn't...
>
> /me puzzled.

Me too. Maybe there just happens to be something else also going wrong
in next. I am doing a bit more testing to see if applying the fix
directly on top of this change fixes it to try and eliminate anything
else in -next.

Linus, what -next are you testing on? I am using next-20200916.

Jon

--
nvpublic

2020-09-17 08:57:35

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On 2020-09-17 09:49, Jon Hunter wrote:
> On 17/09/2020 09:45, Marc Zyngier wrote:
>> On 2020-09-17 08:54, Jon Hunter wrote:

>>> So far, I have only tested this patch on Tegra20. Let me try the
>>> other
>>> failing boards this morning and see if those still fail.
>>
>> Tegra20 (if I remember well) is a dual A9 with the same GIC
>> implementation
>> as Ux500, hence requiring the source CPU bits to be written back. So
>> this
>> patch should have cured it, but didn't...
>>
>> /me puzzled.
>
> Me too. Maybe there just happens to be something else also going wrong
> in next. I am doing a bit more testing to see if applying the fix
> directly on top of this change fixes it to try and eliminate anything
> else in -next.
>
> Linus, what -next are you testing on? I am using next-20200916.

You can directly try [1], which has all the queued fixes (and only
that).

M.

[1]
https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/ipi-as-irq
--
Jazz is not dead. It just smells funny...

2020-09-17 09:07:46

by Marek Szyprowski

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Jon,

On 17.09.2020 10:49, Jon Hunter wrote:
> On 17/09/2020 09:45, Marc Zyngier wrote:
>> On 2020-09-17 08:54, Jon Hunter wrote:
>>> On 17/09/2020 08:50, Marc Zyngier wrote:
>>>> On 2020-09-17 08:40, Linus Walleij wrote:
>>>>> On Wed, Sep 16, 2020 at 5:11 PM Marc Zyngier <[email protected]> wrote:
>>>>>
>>>>>> Can you try the patch below and let me know?
>>>>> I tried this patch and now Ux500 WORKS. So this patch is definitely
>>>>> something you should apply.
>>>>>
>>>>>> -                       if (is_frankengic())
>>>>>> -                               set_sgi_intid(irqstat);
>>>>>> +                       this_cpu_write(sgi_intid, intid);
>>>>> This needs changing to irqstat to compile as pointed out by Jon.
>>>>>
>>>>> With that:
>>>>> Tested-by: Linus Walleij <[email protected]>
>>>> Thanks a lot for that.
>>>>
>>>> Still need to understand why some of Jon's systems are left unbootable,
>>>> despite having similar GIC implementations (Tegra194 and Tegra210 use
>>>> the same GIC-400, and yet only one of the two boots correctly...).
>>> So far, I have only tested this patch on Tegra20. Let me try the other
>>> failing boards this morning and see if those still fail.
>> Tegra20 (if I remember well) is a dual A9 with the same GIC implementation
>> as Ux500, hence requiring the source CPU bits to be written back. So this
>> patch should have cured it, but didn't...
>>
>> /me puzzled.
> Me too. Maybe there just happens to be something else also going wrong
> in next. I am doing a bit more testing to see if applying the fix
> directly on top of this change fixes it to try and eliminate anything
> else in -next.
>
> Linus, what -next are you testing on? I am using next-20200916.

next-20200916 completely broken on ARM and ARM64. Please check
next-20200915 + the mentioned fix or just check
https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/ipi-as-irq

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2020-09-17 09:12:33

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts


On 17/09/2020 09:54, Marek Szyprowski wrote:
> Hi Jon,
>
> On 17.09.2020 10:49, Jon Hunter wrote:
>> On 17/09/2020 09:45, Marc Zyngier wrote:
>>> On 2020-09-17 08:54, Jon Hunter wrote:
>>>> On 17/09/2020 08:50, Marc Zyngier wrote:
>>>>> On 2020-09-17 08:40, Linus Walleij wrote:
>>>>>> On Wed, Sep 16, 2020 at 5:11 PM Marc Zyngier <[email protected]> wrote:
>>>>>>
>>>>>>> Can you try the patch below and let me know?
>>>>>> I tried this patch and now Ux500 WORKS. So this patch is definitely
>>>>>> something you should apply.
>>>>>>
>>>>>>> -                       if (is_frankengic())
>>>>>>> -                               set_sgi_intid(irqstat);
>>>>>>> +                       this_cpu_write(sgi_intid, intid);
>>>>>> This needs changing to irqstat to compile as pointed out by Jon.
>>>>>>
>>>>>> With that:
>>>>>> Tested-by: Linus Walleij <[email protected]>
>>>>> Thanks a lot for that.
>>>>>
>>>>> Still need to understand why some of Jon's systems are left unbootable,
>>>>> despite having similar GIC implementations (Tegra194 and Tegra210 use
>>>>> the same GIC-400, and yet only one of the two boots correctly...).
>>>> So far, I have only tested this patch on Tegra20. Let me try the other
>>>> failing boards this morning and see if those still fail.
>>> Tegra20 (if I remember well) is a dual A9 with the same GIC implementation
>>> as Ux500, hence requiring the source CPU bits to be written back. So this
>>> patch should have cured it, but didn't...
>>>
>>> /me puzzled.
>> Me too. Maybe there just happens to be something else also going wrong
>> in next. I am doing a bit more testing to see if applying the fix
>> directly on top of this change fixes it to try and eliminate anything
>> else in -next.
>>
>> Linus, what -next are you testing on? I am using next-20200916.
>
> next-20200916 completely broken on ARM and ARM64. Please check
> next-20200915 + the mentioned fix or just check
> https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/ipi-as-irq

Ah thanks! Any idea what is causing the other failure on next-20200916?

Yes we have noticed that now everything fails next-20200916 so not just
this issue.

Cheers
Jon

--
nvpublic

2020-09-17 09:15:36

by Marek Szyprowski

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Jon,

On 17.09.2020 11:09, Jon Hunter wrote:
> On 17/09/2020 09:54, Marek Szyprowski wrote:
>> On 17.09.2020 10:49, Jon Hunter wrote:
>>> On 17/09/2020 09:45, Marc Zyngier wrote:
>>>> On 2020-09-17 08:54, Jon Hunter wrote:
>>>>> On 17/09/2020 08:50, Marc Zyngier wrote:
>>>>>> On 2020-09-17 08:40, Linus Walleij wrote:
>>>>>>> On Wed, Sep 16, 2020 at 5:11 PM Marc Zyngier <[email protected]> wrote:
>>>>>>>
>>>>>>>> Can you try the patch below and let me know?
>>>>>>> I tried this patch and now Ux500 WORKS. So this patch is definitely
>>>>>>> something you should apply.
>>>>>>>
>>>>>>>> -                       if (is_frankengic())
>>>>>>>> -                               set_sgi_intid(irqstat);
>>>>>>>> +                       this_cpu_write(sgi_intid, intid);
>>>>>>> This needs changing to irqstat to compile as pointed out by Jon.
>>>>>>>
>>>>>>> With that:
>>>>>>> Tested-by: Linus Walleij <[email protected]>
>>>>>> Thanks a lot for that.
>>>>>>
>>>>>> Still need to understand why some of Jon's systems are left unbootable,
>>>>>> despite having similar GIC implementations (Tegra194 and Tegra210 use
>>>>>> the same GIC-400, and yet only one of the two boots correctly...).
>>>>> So far, I have only tested this patch on Tegra20. Let me try the other
>>>>> failing boards this morning and see if those still fail.
>>>> Tegra20 (if I remember well) is a dual A9 with the same GIC implementation
>>>> as Ux500, hence requiring the source CPU bits to be written back. So this
>>>> patch should have cured it, but didn't...
>>>>
>>>> /me puzzled.
>>> Me too. Maybe there just happens to be something else also going wrong
>>> in next. I am doing a bit more testing to see if applying the fix
>>> directly on top of this change fixes it to try and eliminate anything
>>> else in -next.
>>>
>>> Linus, what -next are you testing on? I am using next-20200916.
>> next-20200916 completely broken on ARM and ARM64. Please check
>> next-20200915 + the mentioned fix or just check
>> https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/ipi-as-irq
> Ah thanks! Any idea what is causing the other failure on next-20200916?
>
> Yes we have noticed that now everything fails next-20200916 so not just
> this issue.

The issue is caused by commit c999bd436fe9 ("mm/cma: make number of CMA
areas dynamic, remove CONFIG_CMA_AREAS")

https://lore.kernel.org/linux-arm-kernel/[email protected]/

Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland

2020-09-17 09:30:47

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On 2020-09-17 10:13, Marek Szyprowski wrote:

[...]

>>>> Linus, what -next are you testing on? I am using next-20200916.
>>> next-20200916 completely broken on ARM and ARM64. Please check
>>> next-20200915 + the mentioned fix or just check
>>> https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/ipi-as-irq
>> Ah thanks! Any idea what is causing the other failure on
>> next-20200916?
>>
>> Yes we have noticed that now everything fails next-20200916 so not
>> just
>> this issue.
>
> The issue is caused by commit c999bd436fe9 ("mm/cma: make number of CMA
> areas dynamic, remove CONFIG_CMA_AREAS")
>
> https://lore.kernel.org/linux-arm-kernel/[email protected]/

There is a workaround here[1] for arm64, but I doubt that's the end of
it (32bit is still dead).

M.

[1]
https://lore.kernel.org/linux-arm-kernel/[email protected]/
--
Jazz is not dead. It just smells funny...

2020-09-17 10:13:49

by Linus Walleij

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On Thu, Sep 17, 2020 at 10:49 AM Jon Hunter <[email protected]> wrote:

> Linus, what -next are you testing on? I am using next-20200916.

That's what I use. But the Ux500 graphics are simple and does not
use CMA and that is why I don't see this crash (I assume).

Yours,
Linus Walleij

2020-09-17 18:29:30

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts


On 17/09/2020 15:53, Jon Hunter wrote:

...

>> next-20200916 completely broken on ARM and ARM64. Please check
>> next-20200915 + the mentioned fix or just check
>> https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/ipi-as-irq
>
> OK, I have confirmed that on Tegra20 and Tegra30, that next-20200915 +
> Marc's fix boots fine.
>
> Tegra186 and Tegra194 are not booting, but I am wondering if this is yet
> another issue that is not related. I have not actually bisected on these
> boards, but I am now bisecting on Tegra186 to confirm.

Hmm ... well bisect on Tegra186 is also pointing to this commit, but the
fix is not working there. Wonder what's going on with this gic-400?

Jon

--
nvpublic

2020-09-17 19:59:24

by Jon Hunter

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts


On 17/09/2020 09:54, Marek Szyprowski wrote:
> Hi Jon,
>
> On 17.09.2020 10:49, Jon Hunter wrote:
>> On 17/09/2020 09:45, Marc Zyngier wrote:
>>> On 2020-09-17 08:54, Jon Hunter wrote:
>>>> On 17/09/2020 08:50, Marc Zyngier wrote:
>>>>> On 2020-09-17 08:40, Linus Walleij wrote:
>>>>>> On Wed, Sep 16, 2020 at 5:11 PM Marc Zyngier <[email protected]> wrote:
>>>>>>
>>>>>>> Can you try the patch below and let me know?
>>>>>> I tried this patch and now Ux500 WORKS. So this patch is definitely
>>>>>> something you should apply.
>>>>>>
>>>>>>> -                       if (is_frankengic())
>>>>>>> -                               set_sgi_intid(irqstat);
>>>>>>> +                       this_cpu_write(sgi_intid, intid);
>>>>>> This needs changing to irqstat to compile as pointed out by Jon.
>>>>>>
>>>>>> With that:
>>>>>> Tested-by: Linus Walleij <[email protected]>
>>>>> Thanks a lot for that.
>>>>>
>>>>> Still need to understand why some of Jon's systems are left unbootable,
>>>>> despite having similar GIC implementations (Tegra194 and Tegra210 use
>>>>> the same GIC-400, and yet only one of the two boots correctly...).
>>>> So far, I have only tested this patch on Tegra20. Let me try the other
>>>> failing boards this morning and see if those still fail.
>>> Tegra20 (if I remember well) is a dual A9 with the same GIC implementation
>>> as Ux500, hence requiring the source CPU bits to be written back. So this
>>> patch should have cured it, but didn't...
>>>
>>> /me puzzled.
>> Me too. Maybe there just happens to be something else also going wrong
>> in next. I am doing a bit more testing to see if applying the fix
>> directly on top of this change fixes it to try and eliminate anything
>> else in -next.
>>
>> Linus, what -next are you testing on? I am using next-20200916.
>
> next-20200916 completely broken on ARM and ARM64. Please check
> next-20200915 + the mentioned fix or just check
> https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/ipi-as-irq

OK, I have confirmed that on Tegra20 and Tegra30, that next-20200915 +
Marc's fix boots fine.

Tegra186 and Tegra194 are not booting, but I am wondering if this is yet
another issue that is not related. I have not actually bisected on these
boards, but I am now bisecting on Tegra186 to confirm.

Jon

--
nvpublic

2020-09-18 09:53:04

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

On 2020-09-17 19:24, Jon Hunter wrote:
> On 17/09/2020 15:53, Jon Hunter wrote:
>
> ...
>
>>> next-20200916 completely broken on ARM and ARM64. Please check
>>> next-20200915 + the mentioned fix or just check
>>> https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/ipi-as-irq
>>
>> OK, I have confirmed that on Tegra20 and Tegra30, that next-20200915 +
>> Marc's fix boots fine.
>>
>> Tegra186 and Tegra194 are not booting, but I am wondering if this is
>> yet
>> another issue that is not related. I have not actually bisected on
>> these
>> boards, but I am now bisecting on Tegra186 to confirm.
>
> Hmm ... well bisect on Tegra186 is also pointing to this commit, but
> the
> fix is not working there. Wonder what's going on with this gic-400?

It's not GIC400. I have tons of machines with GIC400 around me, and they
don't even need the fix. What happens if you only boot the non-Denver
CPUs?

M.
--
Jazz is not dead. It just smells funny...

2020-09-18 10:00:43

by James Morse

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi Marc,

(CC: +Jon)

On 01/09/2020 15:43, Marc Zyngier wrote:
> Change the way we deal with GIC SGIs by turning them into proper
> IRQs, and calling into the arch code to register the interrupt range
> instead of a callback.

Your comment "This only works because we don't nest SGIs..." on this thread tripped some
bad memories from adding the irq-stack. Softirq causes us to nest irqs, but only once.


(I've messed with the below diff to remove the added stuff:)

> diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
> index 4ffd62af888f..4be2b62f816f 100644
> --- a/drivers/irqchip/irq-gic.c
> +++ b/drivers/irqchip/irq-gic.c
> @@ -335,31 +335,22 @@ static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs)
> irqstat = readl_relaxed(cpu_base + GIC_CPU_INTACK);
> irqnr = irqstat & GICC_IAR_INT_ID_MASK;
>
> - if (likely(irqnr > 15 && irqnr < 1020)) {
> - if (static_branch_likely(&supports_deactivate_key))
> - writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
> - isb();
> - handle_domain_irq(gic->domain, irqnr, regs);
> - continue;
> - }
> - if (irqnr < 16) {
> writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
> - if (static_branch_likely(&supports_deactivate_key))
> - writel_relaxed(irqstat, cpu_base + GIC_CPU_DEACTIVATE);
> -#ifdef CONFIG_SMP
> - /*
> - * Ensure any shared data written by the CPU sending
> - * the IPI is read after we've read the ACK register
> - * on the GIC.
> - *
> - * Pairs with the write barrier in gic_raise_softirq
> - */
> smp_rmb();
> - handle_IPI(irqnr, regs);

If I read this right, previously we would EOI the interrupt before calling handle_IPI().
Where as now with the version of this series in your tree, we stuff the to-be-EOId value
in a percpu variable, which is only safe if these don't nest.

Hidden in irq_exit(), kernel/softirq.c::__irq_exit_rcu() has this:
| preempt_count_sub(HARDIRQ_OFFSET);
| if (!in_interrupt() && local_softirq_pending())
| invoke_softirq();

The arch code doesn't raise the preempt counter by HARDIRQ, so once __irq_exit_rcu() has
dropped it, in_interrupt() returns false, and we invoke_softirq().

invoke_softirq() -> __do_softirq() -> local_irq_enable()!

Fortunately, __do_softirq() raises the softirq count first using __local_bh_disable_ip(),
which in-interrupt() checks too, so this can only happen once per IRQ.

Now the irq_exit() has moved from handle_IPI(), which ran after EOI, into
handle_domain_irq(), which runs before. I think its possible SGIs nest, and the new percpu
variable becomes corrupted.

Presumably this isn't a problem for regular IRQ, as they don't need the sending-CPU in
order to EOI, which is why it wasn't a problem before.

Adding anything to preempt-count around the whole thing upsets RCU, and softirq seems to
expect this nesting, but evidently the gic does not. I'm not sure what the right thing to
do would be. A dirty hack like [0] would confirm the theory.

/me runs

Thanks,

James



[0] A dirty hack
-----------%<-----------
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf88d7f62433..50e14d8cbec3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -376,7 +376,7 @@ static inline void invoke_softirq(void)
if (ksoftirqd_running(local_softirq_pending()))
return;

- if (!force_irqthreads) {
+ if (false) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
* We can safely execute softirq on the current stack if
@@ -393,6 +393,7 @@ static inline void invoke_softirq(void)
do_softirq_own_stack();
#endif
} else {
+ /* hack: force this */
wakeup_softirqd();
}
}
-----------%<-----------

2020-09-18 10:24:52

by Marc Zyngier

[permalink] [raw]
Subject: Re: [PATCH v3 08/16] irqchip/gic: Configure SGIs as standard interrupts

Hi James,

On Fri, 18 Sep 2020 10:58:45 +0100,
James Morse <[email protected]> wrote:
>
> Hi Marc,
>
> (CC: +Jon)
>
> On 01/09/2020 15:43, Marc Zyngier wrote:
> > Change the way we deal with GIC SGIs by turning them into proper
> > IRQs, and calling into the arch code to register the interrupt range
> > instead of a callback.
>
> Your comment "This only works because we don't nest SGIs..." on this
> thread tripped some bad memories from adding the irq-stack. Softirq
> causes us to nest irqs, but only once.
>
>
> (I've messed with the below diff to remove the added stuff:)
>
> > diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
> > index 4ffd62af888f..4be2b62f816f 100644
> > --- a/drivers/irqchip/irq-gic.c
> > +++ b/drivers/irqchip/irq-gic.c
> > @@ -335,31 +335,22 @@ static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs)
> > irqstat = readl_relaxed(cpu_base + GIC_CPU_INTACK);
> > irqnr = irqstat & GICC_IAR_INT_ID_MASK;
> >
> > - if (likely(irqnr > 15 && irqnr < 1020)) {
> > - if (static_branch_likely(&supports_deactivate_key))
> > - writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
> > - isb();
> > - handle_domain_irq(gic->domain, irqnr, regs);
> > - continue;
> > - }
> > - if (irqnr < 16) {
> > writel_relaxed(irqstat, cpu_base + GIC_CPU_EOI);
> > - if (static_branch_likely(&supports_deactivate_key))
> > - writel_relaxed(irqstat, cpu_base + GIC_CPU_DEACTIVATE);
> > -#ifdef CONFIG_SMP
> > - /*
> > - * Ensure any shared data written by the CPU sending
> > - * the IPI is read after we've read the ACK register
> > - * on the GIC.
> > - *
> > - * Pairs with the write barrier in gic_raise_softirq
> > - */
> > smp_rmb();
> > - handle_IPI(irqnr, regs);
>
> If I read this right, previously we would EOI the interrupt before
> calling handle_IPI(). Where as now with the version of this series
> in your tree, we stuff the to-be-EOId value in a percpu variable,
> which is only safe if these don't nest.
>
> Hidden in irq_exit(), kernel/softirq.c::__irq_exit_rcu() has this:
> | preempt_count_sub(HARDIRQ_OFFSET);
> | if (!in_interrupt() && local_softirq_pending())
> | invoke_softirq();
>
> The arch code doesn't raise the preempt counter by HARDIRQ, so once
> __irq_exit_rcu() has dropped it, in_interrupt() returns false, and
> we invoke_softirq().
>
> invoke_softirq() -> __do_softirq() -> local_irq_enable()!
>
> Fortunately, __do_softirq() raises the softirq count first using
> __local_bh_disable_ip(), which in-interrupt() checks too, so this
> can only happen once per IRQ.
>
> Now the irq_exit() has moved from handle_IPI(), which ran after EOI,
> into handle_domain_irq(), which runs before. I think its possible
> SGIs nest, and the new percpu variable becomes corrupted.

I can't see how. The interrupt is active until we EOI/deactivate it,
and thus cannot be observed again by the CPU interface until this
happens.

Furthermore, irq_exit() in __handle_domain_irq() is *after* the EOI
anyway (generic_handle_irq_() directly calls the flow, which
immediately EOIs the interrupt). The only material change is that
irq_enter() happens before EOI. Is that what you are referring to?

Thanks,

M.

--
Without deviation from the norm, progress is not possible.