2021-08-03 01:13:47

by Guo Ren

[permalink] [raw]
Subject: [PATCH 1/2] irqchip/sifive-plic: Fix PLIC crash on touching offline CPU context

From: Guo Ren <[email protected]>

The current PLIC driver touches the context of offline CPUs, which
causes a bus error on some chips in CPU hotplug scenarios.

This patch fixes the problem by preventing the PLIC driver from accessing
offline CPU contexts in plic_init() and plic_set_affinity().

Signed-off-by: Guo Ren <[email protected]>
Cc: Anup Patel <[email protected]>
Cc: Atish Patra <[email protected]>
Cc: Greentime Hu <[email protected]>
Cc: Marc Zyngier <[email protected]>
---
drivers/irqchip/irq-sifive-plic.c | 26 +++++++++++++++++---------
1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c
index cf74cfa..9c9bb20 100644
--- a/drivers/irqchip/irq-sifive-plic.c
+++ b/drivers/irqchip/irq-sifive-plic.c
@@ -64,6 +64,7 @@ struct plic_priv {
struct cpumask lmask;
struct irq_domain *irqdomain;
void __iomem *regs;
+ unsigned int nr_irqs;
};

struct plic_handler {
@@ -150,7 +151,7 @@ static int plic_set_affinity(struct irq_data *d,
if (cpu >= nr_cpu_ids)
return -EINVAL;

- plic_irq_toggle(&priv->lmask, d, 0);
+ plic_irq_toggle(cpu_online_mask, d, 0);
plic_irq_toggle(cpumask_of(cpu), d, !irqd_irq_masked(d));

irq_data_update_effective_affinity(d, cpumask_of(cpu));
@@ -251,15 +252,25 @@ static void plic_set_threshold(struct plic_handler *handler, u32 threshold)

static int plic_dying_cpu(unsigned int cpu)
{
+ struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
+
if (plic_parent_irq)
disable_percpu_irq(plic_parent_irq);

+ handler->present = false;
+
return 0;
}

static int plic_starting_cpu(unsigned int cpu)
{
struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
+ irq_hw_number_t hwirq;
+
+ handler->present = true;
+
+ for (hwirq = 1; hwirq <= handler->priv->nr_irqs; hwirq++)
+ plic_toggle(handler, hwirq, 0);

if (plic_parent_irq)
enable_percpu_irq(plic_parent_irq,
@@ -275,7 +286,6 @@ static int __init plic_init(struct device_node *node,
struct device_node *parent)
{
int error = 0, nr_contexts, nr_handlers = 0, i;
- u32 nr_irqs;
struct plic_priv *priv;
struct plic_handler *handler;

@@ -290,8 +300,8 @@ static int __init plic_init(struct device_node *node,
}

error = -EINVAL;
- of_property_read_u32(node, "riscv,ndev", &nr_irqs);
- if (WARN_ON(!nr_irqs))
+ of_property_read_u32(node, "riscv,ndev", &priv->nr_irqs);
+ if (WARN_ON(!priv->nr_irqs))
goto out_iounmap;

nr_contexts = of_irq_count(node);
@@ -299,14 +309,13 @@ static int __init plic_init(struct device_node *node,
goto out_iounmap;

error = -ENOMEM;
- priv->irqdomain = irq_domain_add_linear(node, nr_irqs + 1,
+ priv->irqdomain = irq_domain_add_linear(node, priv->nr_irqs + 1,
&plic_irqdomain_ops, priv);
if (WARN_ON(!priv->irqdomain))
goto out_iounmap;

for (i = 0; i < nr_contexts; i++) {
struct of_phandle_args parent;
- irq_hw_number_t hwirq;
int cpu, hartid;

if (of_irq_parse_one(node, i, &parent)) {
@@ -354,7 +363,8 @@ static int __init plic_init(struct device_node *node,
}

cpumask_set_cpu(cpu, &priv->lmask);
- handler->present = true;
+ if (cpu == smp_processor_id())
+ handler->present = true;
handler->hart_base =
priv->regs + CONTEXT_BASE + i * CONTEXT_PER_HART;
raw_spin_lock_init(&handler->enable_lock);
@@ -362,8 +372,6 @@ static int __init plic_init(struct device_node *node,
priv->regs + ENABLE_BASE + i * ENABLE_PER_HART;
handler->priv = priv;
done:
- for (hwirq = 1; hwirq <= nr_irqs; hwirq++)
- plic_toggle(handler, hwirq, 0);
nr_handlers++;
}

--
2.7.4



2021-08-03 01:15:18

by Guo Ren

[permalink] [raw]
Subject: [PATCH 2/2] riscv: Improve status in cpu sections of device-tree for cpuhotplug usage

From: Guo Ren <[email protected]>

If we define cpu sections with first okay & second fail:
cpus {
#address-cells = <1>;
#size-cells = <0>;
timebase-frequency = <3000000>;
cpu@0 {
device_type = "cpu";
reg = <0>;
status = "okay";
...
};
cpu@1 {
device_type = "cpu";
reg = <1>;
status = "fail";
...
};
};

Currently, we only get cpu0 without cpu1 in the shell:
$ ls /sys/bus/cpu/devices/
cpu0

But it should show cpu0 as online and cpu1 as offline, so that we can
bring cpu1 online later from the shell. This patch fixes the problem and
lets us bring cpu1 online from the shell after the system has booted:

$ ls /sys/bus/cpu/devices/
cpu0 cpu1
$ cat /sys/bus/cpu/devices/cpu0/online
1
$ cat /sys/bus/cpu/devices/cpu1/online
0

$ echo 224 > /sys/bus/cpu/devices/cpu1/hotplug/target
(cat /sys/devices/system/cpu/hotplug/states "224: online")
$ cat /sys/bus/cpu/devices/cpu1/online
1

Signed-off-by: Guo Ren <[email protected]>
Cc: Anup Patel <[email protected]>
Cc: Atish Patra <[email protected]>
Cc: Palmer Dabbelt <[email protected]>
---
arch/riscv/kernel/cpu.c | 5 -----
arch/riscv/kernel/smpboot.c | 11 +++++++++++
2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index 6d59e69..14f63fd 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -27,11 +27,6 @@ int riscv_of_processor_hartid(struct device_node *node)
return -ENODEV;
}

- if (!of_device_is_available(node)) {
- pr_info("CPU with hartid=%d is not available\n", hart);
- return -ENODEV;
- }
-
if (of_property_read_string(node, "riscv,isa", &isa)) {
pr_warn("CPU with hartid=%d has no \"riscv,isa\" property\n", hart);
return -ENODEV;
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index bd82375..c3b620b 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -37,6 +37,7 @@
#include "head.h"

static DECLARE_COMPLETION(cpu_running);
+static struct cpumask cpu_delay_available_mask = { CPU_BITS_NONE };

void __init smp_prepare_boot_cpu(void)
{
@@ -99,6 +100,11 @@ void __init setup_smp(void)
break;
}

+ if (!of_device_is_available(dn))
+ pr_info("CPU with hartid=%d is not available\n", hart);
+ else
+ cpumask_set_cpu(cpuid, &cpu_delay_available_mask);
+
cpuid_to_hartid_map(cpuid) = hart;
early_map_cpu_to_node(cpuid, of_node_to_nid(dn));
cpuid++;
@@ -131,6 +137,11 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
int ret = 0;
tidle->thread_info.cpu = cpu;

+ if (!cpumask_test_cpu(cpu, &cpu_delay_available_mask)) {
+ cpumask_set_cpu(cpu, &cpu_delay_available_mask);
+ return -EIO;
+ }
+
ret = start_secondary_cpu(cpu, tidle);
if (!ret) {
wait_for_completion_timeout(&cpu_running,
--
2.7.4


2021-08-03 05:14:57

by Anup Patel

[permalink] [raw]
Subject: Re: [PATCH 1/2] irqchip/sifive-plic: Fix PLIC crash on touching offline CPU context

On Tue, Aug 3, 2021 at 6:42 AM <[email protected]> wrote:
>
> From: Guo Ren <[email protected]>
>
> The current plic driver would touch offline CPU context and cause
> bus error in some chip when in CPU hotplug scenario.
>
> This patch fixes up the problem and prevents plic access offline
> CPU context in plic_init() & plic_set_affinity().
>
> Signed-off-by: Guo Ren <[email protected]>
> Cc: Anup Patel <[email protected]>
> Cc: Atish Patra <[email protected]>
> Cc: Greentime Hu <[email protected]>
> Cc: Marc Zyngier <[email protected]>
> ---
> drivers/irqchip/irq-sifive-plic.c | 26 +++++++++++++++++---------
> 1 file changed, 17 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c
> index cf74cfa..9c9bb20 100644
> --- a/drivers/irqchip/irq-sifive-plic.c
> +++ b/drivers/irqchip/irq-sifive-plic.c
> @@ -64,6 +64,7 @@ struct plic_priv {
> struct cpumask lmask;
> struct irq_domain *irqdomain;
> void __iomem *regs;
> + unsigned int nr_irqs;
> };
>
> struct plic_handler {
> @@ -150,7 +151,7 @@ static int plic_set_affinity(struct irq_data *d,
> if (cpu >= nr_cpu_ids)
> return -EINVAL;
>
> - plic_irq_toggle(&priv->lmask, d, 0);
> + plic_irq_toggle(cpu_online_mask, d, 0);

This breaks RISC-V multi-socket platforms with multiple PLIC instances.

When we have multiple PLIC instances in a RISC-V platform, each PLIC
instance will target a different set of HARTs. The "priv->lmask" represents
the CPUs/HARTs targeted by a given PLIC instance.

I am not sure how you are testing your patches but you certainly need to
test more on QEMU. The QEMU virt machine supports multi-socket, so make
sure any patch which can potentially affect multi-socket support is at least
tested on the QEMU virt machine's multi-socket configuration.

> plic_irq_toggle(cpumask_of(cpu), d, !irqd_irq_masked(d));
>
> irq_data_update_effective_affinity(d, cpumask_of(cpu));
> @@ -251,15 +252,25 @@ static void plic_set_threshold(struct plic_handler *handler, u32 threshold)
>
> static int plic_dying_cpu(unsigned int cpu)
> {
> + struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
> +
> if (plic_parent_irq)
> disable_percpu_irq(plic_parent_irq);
>
> + handler->present = false;
> +

Drop these changes in plic_dying_cpu(), see comments below.

> return 0;
> }
>
> static int plic_starting_cpu(unsigned int cpu)
> {
> struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
> + irq_hw_number_t hwirq;
> +
> + handler->present = true;

The "handler->present" flag indicates that we have PLIC context
associated with the given handler. It has nothing to do with CPU
hot-plug.

> +
> + for (hwirq = 1; hwirq <= handler->priv->nr_irqs; hwirq++)
> + plic_toggle(handler, hwirq, 0);
>
> if (plic_parent_irq)
> enable_percpu_irq(plic_parent_irq,
> @@ -275,7 +286,6 @@ static int __init plic_init(struct device_node *node,
> struct device_node *parent)
> {
> int error = 0, nr_contexts, nr_handlers = 0, i;
> - u32 nr_irqs;
> struct plic_priv *priv;
> struct plic_handler *handler;
>
> @@ -290,8 +300,8 @@ static int __init plic_init(struct device_node *node,
> }
>
> error = -EINVAL;
> - of_property_read_u32(node, "riscv,ndev", &nr_irqs);
> - if (WARN_ON(!nr_irqs))
> + of_property_read_u32(node, "riscv,ndev", &priv->nr_irqs);
> + if (WARN_ON(!priv->nr_irqs))
> goto out_iounmap;
>
> nr_contexts = of_irq_count(node);
> @@ -299,14 +309,13 @@ static int __init plic_init(struct device_node *node,
> goto out_iounmap;
>
> error = -ENOMEM;
> - priv->irqdomain = irq_domain_add_linear(node, nr_irqs + 1,
> + priv->irqdomain = irq_domain_add_linear(node, priv->nr_irqs + 1,
> &plic_irqdomain_ops, priv);
> if (WARN_ON(!priv->irqdomain))
> goto out_iounmap;
>
> for (i = 0; i < nr_contexts; i++) {
> struct of_phandle_args parent;
> - irq_hw_number_t hwirq;
> int cpu, hartid;
>
> if (of_irq_parse_one(node, i, &parent)) {
> @@ -354,7 +363,8 @@ static int __init plic_init(struct device_node *node,
> }
>
> cpumask_set_cpu(cpu, &priv->lmask);
> - handler->present = true;
> + if (cpu == smp_processor_id())
> + handler->present = true;

Drop this change.

> handler->hart_base =
> priv->regs + CONTEXT_BASE + i * CONTEXT_PER_HART;
> raw_spin_lock_init(&handler->enable_lock);
> @@ -362,8 +372,6 @@ static int __init plic_init(struct device_node *node,
> priv->regs + ENABLE_BASE + i * ENABLE_PER_HART;
> handler->priv = priv;
> done:
> - for (hwirq = 1; hwirq <= nr_irqs; hwirq++)
> - plic_toggle(handler, hwirq, 0);

In plic_init(), we are bringing all interrupts of the PLIC context to a known
state, which is disabled by default. We don't need to do this every
time a HART/CPU is brought up, but I am okay with moving this to
plic_starting_cpu() if it helps fix issues on any RISC-V platform.

> nr_handlers++;
> }
>
> --
> 2.7.4
>

Regards,
Anup

2021-08-10 07:36:49

by Guo Ren

[permalink] [raw]
Subject: Re: [PATCH 1/2] irqchip/sifive-plic: Fix PLIC crash on touching offline CPU context

Hi Anup,

Sorry for the late reply.

On Tue, Aug 3, 2021 at 1:13 PM Anup Patel <[email protected]> wrote:
>
> On Tue, Aug 3, 2021 at 6:42 AM <[email protected]> wrote:
> >
> > From: Guo Ren <[email protected]>
> >
> > The current plic driver would touch offline CPU context and cause
> > bus error in some chip when in CPU hotplug scenario.
> >
> > This patch fixes up the problem and prevents plic access offline
> > CPU context in plic_init() & plic_set_affinity().
> >
> > Signed-off-by: Guo Ren <[email protected]>
> > Cc: Anup Patel <[email protected]>
> > Cc: Atish Patra <[email protected]>
> > Cc: Greentime Hu <[email protected]>
> > Cc: Marc Zyngier <[email protected]>
> > ---
> > drivers/irqchip/irq-sifive-plic.c | 26 +++++++++++++++++---------
> > 1 file changed, 17 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c
> > index cf74cfa..9c9bb20 100644
> > --- a/drivers/irqchip/irq-sifive-plic.c
> > +++ b/drivers/irqchip/irq-sifive-plic.c
> > @@ -64,6 +64,7 @@ struct plic_priv {
> > struct cpumask lmask;
> > struct irq_domain *irqdomain;
> > void __iomem *regs;
> > + unsigned int nr_irqs;
> > };
> >
> > struct plic_handler {
> > @@ -150,7 +151,7 @@ static int plic_set_affinity(struct irq_data *d,
> > if (cpu >= nr_cpu_ids)
> > return -EINVAL;
> >
> > - plic_irq_toggle(&priv->lmask, d, 0);
> > + plic_irq_toggle(cpu_online_mask, d, 0);
>
> This breaks RISC-V multi-socket platforms with multiple PLIC instances.
Yes, I hadn't considered the multi-socket scenario.

>
> When we have multiple PLIC instances in a RISC-V platform, each PLIC
> instance will target a different set of HARTs. The "priv->lmask" represents
> the CPUs/HARTs targeted by a given PLIC instance.
Okay, I will correct it with:
- plic_irq_toggle(&priv->lmask, d, 0);
+ cpumask_and(&amask, &priv->lmask, cpu_online_mask);
+ plic_irq_toggle(&amask, d, 0);

>
> I am not sure how you are testing your patches but you certainly need to
> test more on QEMU. The QEMU virt machine support multi-socket so make
> sure any patch which can potentially affect multi-socket support is at least
> tested on QEMU virt machine multi-socket configuration.
The patch has been tested with our hardware platforms and qemu 4.1.
But in that version of qemu, riscv didn't support multi-socket.

I will update my QEMU environment and follow your steps :)

>
> > plic_irq_toggle(cpumask_of(cpu), d, !irqd_irq_masked(d));
> >
> > irq_data_update_effective_affinity(d, cpumask_of(cpu));
> > @@ -251,15 +252,25 @@ static void plic_set_threshold(struct plic_handler *handler, u32 threshold)
> >
> > static int plic_dying_cpu(unsigned int cpu)
> > {
> > + struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
> > +
> > if (plic_parent_irq)
> > disable_percpu_irq(plic_parent_irq);
> >
> > + handler->present = false;
> > +
>
> Drop these changes in plic_dying_cpu(), see comments below.
>
> > return 0;
> > }
> >
> > static int plic_starting_cpu(unsigned int cpu)
> > {
> > struct plic_handler *handler = this_cpu_ptr(&plic_handlers);
> > + irq_hw_number_t hwirq;
> > +
> > + handler->present = true;
>
> The "handler->present" flag indicates that we have PLIC context
> associated with the given handler. It has nothing to do with CPU
> hot-plug.
>
> > +
> > + for (hwirq = 1; hwirq <= handler->priv->nr_irqs; hwirq++)
> > + plic_toggle(handler, hwirq, 0);
> >
> > if (plic_parent_irq)
> > enable_percpu_irq(plic_parent_irq,
> > @@ -275,7 +286,6 @@ static int __init plic_init(struct device_node *node,
> > struct device_node *parent)
> > {
> > int error = 0, nr_contexts, nr_handlers = 0, i;
> > - u32 nr_irqs;
> > struct plic_priv *priv;
> > struct plic_handler *handler;
> >
> > @@ -290,8 +300,8 @@ static int __init plic_init(struct device_node *node,
> > }
> >
> > error = -EINVAL;
> > - of_property_read_u32(node, "riscv,ndev", &nr_irqs);
> > - if (WARN_ON(!nr_irqs))
> > + of_property_read_u32(node, "riscv,ndev", &priv->nr_irqs);
> > + if (WARN_ON(!priv->nr_irqs))
> > goto out_iounmap;
> >
> > nr_contexts = of_irq_count(node);
> > @@ -299,14 +309,13 @@ static int __init plic_init(struct device_node *node,
> > goto out_iounmap;
> >
> > error = -ENOMEM;
> > - priv->irqdomain = irq_domain_add_linear(node, nr_irqs + 1,
> > + priv->irqdomain = irq_domain_add_linear(node, priv->nr_irqs + 1,
> > &plic_irqdomain_ops, priv);
> > if (WARN_ON(!priv->irqdomain))
> > goto out_iounmap;
> >
> > for (i = 0; i < nr_contexts; i++) {
> > struct of_phandle_args parent;
> > - irq_hw_number_t hwirq;
> > int cpu, hartid;
> >
> > if (of_irq_parse_one(node, i, &parent)) {
> > @@ -354,7 +363,8 @@ static int __init plic_init(struct device_node *node,
> > }
> >
> > cpumask_set_cpu(cpu, &priv->lmask);
> > - handler->present = true;
> > + if (cpu == smp_processor_id())
> > + handler->present = true;
>
> Drop this change.
>
> > handler->hart_base =
> > priv->regs + CONTEXT_BASE + i * CONTEXT_PER_HART;
> > raw_spin_lock_init(&handler->enable_lock);
> > @@ -362,8 +372,6 @@ static int __init plic_init(struct device_node *node,
> > priv->regs + ENABLE_BASE + i * ENABLE_PER_HART;
> > handler->priv = priv;
> > done:
> > - for (hwirq = 1; hwirq <= nr_irqs; hwirq++)
> > - plic_toggle(handler, hwirq, 0);
>
> In plic_init(), we are bringing all interrupts of PLIC context to a known
> state which is being disabled by default. We don't need to do this every
> time a HART/CPU is brought-up but I am okay to move this to
> plic_starting_cpu() if it helps fix issues on any RISC-V platform.
>
> > nr_handlers++;
> > }
> >
> > --
> > 2.7.4
> >
>
> Regards,
> Anup



--
Best Regards
Guo Ren

ML: https://lore.kernel.org/linux-csky/