Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 2620:137:e000::1:20 as permitted sender) client-ip=2620:137:e000::1:20;
Date:   Wed, 21 Sep 2022 19:29:21 +0100
Message-ID: <87k05wk2da.wl-maz@kernel.org>
From:   Marc Zyngier <maz@kernel.org>
To:     Anup Patel <apatel@ventanamicro.com>
Cc:     Palmer Dabbelt <palmer@dabbelt.com>,
        Paul Walmsley <paul.walmsley@sifive.com>,
        Thomas Gleixner <tglx@linutronix.de>,
        Daniel Lezcano <daniel.lezcano@linaro.org>,
        Atish Patra <atishp@atishpatra.org>,
        Alistair Francis <Alistair.Francis@wdc.com>,
        Anup Patel <anup@brainfault.org>,
        linux-riscv@lists.infradead.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v9 3/7] genirq: Add mechanism to multiplex a single HW IPI
In-Reply-To: <20220903161309.32848-4-apatel@ventanamicro.com>
References: <20220903161309.32848-1-apatel@ventanamicro.com>
        <20220903161309.32848-4-apatel@ventanamicro.com>
User-Agent: Wanderlust/2.15.9 (Almost Unreal) SEMI-EPG/1.14.7 (Harue)
 FLIM-LB/1.14.9 (=?UTF-8?B?R29qxY0=?=) APEL-LB/10.8 EasyPG/1.0.0 Emacs/27.1
 (x86_64-pc-linux-gnu) MULE/6.0 (HANACHIRUSATO)
MIME-Version: 1.0 (generated by SEMI-EPG 1.14.7 - "Harue")
Content-Type: text/plain; charset=US-ASCII
Precedence: bulk

On Sat, 03 Sep 2022 17:13:05 +0100,
Anup Patel <apatel@ventanamicro.com> wrote:
> 
> All RISC-V platforms have a single HW IPI provided by the INTC local
> interrupt controller. The HW method to trigger INTC IPI can be through
> external irqchip (e.g. RISC-V AIA), through platform specific device
> (e.g. SiFive CLINT timer), or through firmware (e.g. SBI IPI call).
> 
> To support multiple IPIs on RISC-V, we add a generic IPI multiplexing
> mechanism which help us create multiple virtual IPIs using a single
> HW IPI. This generic IPI multiplexing is shared among various RISC-V
> irqchip drivers.
> 
> Signed-off-by: Anup Patel <apatel@ventanamicro.com>
> ---
>  include/linux/irq.h  |  18 ++++
>  kernel/irq/Kconfig   |   5 +
>  kernel/irq/Makefile  |   1 +
>  kernel/irq/ipi-mux.c | 244 +++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 268 insertions(+)
>  create mode 100644 kernel/irq/ipi-mux.c
> 
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index c3eb89606c2b..5ab702cb0a5b 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -1266,6 +1266,24 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
>  int ipi_send_single(unsigned int virq, unsigned int cpu);
>  int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
>  
> +/**
> + * struct ipi_mux_ops - IPI multiplex operations
> + *
> + * @ipi_mux_pre_handle:	Optional function called before handling parent IPI
> + * @ipi_mux_post_handle:Optional function called after handling parent IPI
> + * @ipi_mux_send:	Trigger parent IPI on target CPUs
> + */
> +struct ipi_mux_ops {
> +	void (*ipi_mux_pre_handle)(unsigned int parent_virq, void *data);
> +	void (*ipi_mux_post_handle)(unsigned int parent_virq, void *data);
> +	void (*ipi_mux_send)(unsigned int parent_virq, void *data,
> +			     const struct cpumask *mask);
> +};

What is the rational for these pre/post handling functions? We don't
have them for normal interrupts, why are they required for IPIs?

> +
> +void ipi_mux_process(void);
> +int ipi_mux_create(unsigned int parent_virq, unsigned int nr_ipi,
> +		   const struct ipi_mux_ops *ops, void *data);
> +
>  #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
>  /*
>   * Registers a generic IRQ handling function as the top-level IRQ handler in
> diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> index db3d174c53d4..df17dbc54b02 100644
> --- a/kernel/irq/Kconfig
> +++ b/kernel/irq/Kconfig
> @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI
>  	depends on SMP
>  	select IRQ_DOMAIN_HIERARCHY
>  
> +# Generic IRQ IPI Mux support
> +config GENERIC_IRQ_IPI_MUX
> +	bool
> +	depends on SMP
> +
>  # Generic MSI interrupt support
>  config GENERIC_MSI_IRQ
>  	bool
> diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> index b4f53717d143..f19d3080bf11 100644
> --- a/kernel/irq/Makefile
> +++ b/kernel/irq/Makefile
> @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
>  obj-$(CONFIG_PM_SLEEP) += pm.o
>  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
>  obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
> +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
>  obj-$(CONFIG_SMP) += affinity.o
>  obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
>  obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
> diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
> new file mode 100644
> index 000000000000..8939fa2be73c
> --- /dev/null
> +++ b/kernel/irq/ipi-mux.c
> @@ -0,0 +1,244 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Multiplex several virtual IPIs over a single HW IPI.
> + *
> + * Copyright (c) 2022 Ventana Micro Systems Inc.

Basic courtesy would be to at least mention where this was lifted
from, as well as the original copyrights. Also, the original code
states "GPL-2.0-or-later". Since this is a derivative work, the same
licence should apply, shouldn't it?

> + */
> +
> +#define pr_fmt(fmt) "ipi-mux: " fmt
> +#include <linux/cpu.h>
> +#include <linux/init.h>
> +#include <linux/irq.h>
> +#include <linux/irqchip.h>
> +#include <linux/irqchip/chained_irq.h>
> +#include <linux/irqdomain.h>
> +#include <linux/smp.h>
> +
> +static void *ipi_mux_data;
> +static unsigned int ipi_mux_nr;
> +static unsigned int ipi_mux_parent_virq;
> +static struct irq_domain *ipi_mux_domain;
> +static const struct ipi_mux_ops *ipi_mux_ops;
> +static DEFINE_PER_CPU(atomic_t, ipi_mux_enable);
> +static DEFINE_PER_CPU(atomic_t, ipi_mux_bits);

How about making these fields part of a structure, and only allocate
them on creation of an IPI irqdomain? Most platforms do not require
this at all.

> +
> +static void ipi_mux_mask(struct irq_data *d)
> +{
> +	atomic_andnot(BIT(irqd_to_hwirq(d)), this_cpu_ptr(&ipi_mux_enable));
> +}
> +
> +static void ipi_mux_unmask(struct irq_data *d)
> +{
> +	u32 ipi_bit = BIT(irqd_to_hwirq(d));
> +
> +	atomic_or(ipi_bit, this_cpu_ptr(&ipi_mux_enable));
> +
> +	/*
> +	 * The atomic_or() above must complete before the atomic_read()
> +	 * below to avoid racing ipi_mux_send_mask().
> +	 */
> +	smp_mb__after_atomic();
> +
> +	/* If a pending IPI was unmasked, raise a parent IPI immediately. */
> +	if (atomic_read(this_cpu_ptr(&ipi_mux_bits)) & ipi_bit)
> +		ipi_mux_ops->ipi_mux_send(ipi_mux_parent_virq, ipi_mux_data,
> +					  cpumask_of(smp_processor_id()));
> +}
> +
> +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
> +{
> +	u32 ipi_bit = BIT(irqd_to_hwirq(d));
> +	struct cpumask pmask = { 0 };

Urgh, no (see the comment in cpumask.h). struct cpumask can be really
huge (it uses NR_CPUS), and we only want to depend on nr_cpus. Use a
per-CPU temp if you must, but don't allocate something like this on
the stack.

> +	unsigned long pending;
> +	int cpu;
> +
> +	for_each_cpu(cpu, mask) {
> +		pending = atomic_fetch_or_release(ipi_bit,
> +					per_cpu_ptr(&ipi_mux_bits, cpu));
> +
> +		/*
> +		 * The atomic_fetch_or_release() above must complete before
> +		 * the atomic_read() below to avoid racing ipi_mux_unmask().
> +		 */
> +		smp_mb__after_atomic();
> +
> +		if (!(pending & ipi_bit) &&
> +		    (atomic_read(per_cpu_ptr(&ipi_mux_enable, cpu)) & ipi_bit))
> +			cpumask_set_cpu(cpu, &pmask);
> +	}
> +
> +	/* Trigger the parent IPI */
> +	ipi_mux_ops->ipi_mux_send(ipi_mux_parent_virq, ipi_mux_data, &pmask);
> +}
> +
> +static const struct irq_chip ipi_mux_chip = {
> +	.name		= "IPI Mux",
> +	.irq_mask	= ipi_mux_mask,
> +	.irq_unmask	= ipi_mux_unmask,
> +	.ipi_send_mask	= ipi_mux_send_mask,
> +};
> +
> +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
> +				unsigned int nr_irqs, void *arg)
> +{
> +	struct irq_fwspec *fwspec = arg;
> +	irq_hw_number_t hwirq;
> +	unsigned int type;
> +	int i, ret;
> +
> +	ret = irq_domain_translate_onecell(d, fwspec, &hwirq, &type);

Where is the format of this fwspec documented?

> +	if (ret)
> +		return ret;
> +
> +	for (i = 0; i < nr_irqs; i++) {
> +		irq_set_percpu_devid(virq + i);
> +		irq_domain_set_info(d, virq + i, hwirq + i,
> +				    &ipi_mux_chip, d->host_data,
> +				    handle_percpu_devid_irq, NULL, NULL);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct irq_domain_ops ipi_mux_domain_ops = {
> +	.alloc		= ipi_mux_domain_alloc,
> +	.free		= irq_domain_free_irqs_top,
> +};
> +
> +/**
> + * ipi_mux_process - Process multiplexed virtual IPIs
> + */
> +void ipi_mux_process(void)
> +{
> +	irq_hw_number_t hwirq;
> +	unsigned long ipis;
> +	int en, err;
> +
> +	if (ipi_mux_ops->ipi_mux_pre_handle)

Given that this shouldn't change in the lifetime of the system, this
needs to be turned into a static key in order to avoid the overhead of
accessing this on each and every IPI.

> +		ipi_mux_ops->ipi_mux_pre_handle(ipi_mux_parent_virq,
> +						ipi_mux_data);
> +
> +	/*
> +	 * Reading enable mask does not need to be ordered as long as
> +	 * this function called from interrupt handler because only
> +	 * the CPU itself can change it's own enable mask.
> +	 */
> +	en = atomic_read(this_cpu_ptr(&ipi_mux_enable));
> +
> +	/*
> +	 * Clear the IPIs we are about to handle. This pairs with the
> +	 * atomic_fetch_or_release() in ipi_mux_send_mask().
> +	 */
> +	ipis = atomic_fetch_andnot(en, this_cpu_ptr(&ipi_mux_bits)) & en;
> +
> +	for_each_set_bit(hwirq, &ipis, ipi_mux_nr) {
> +		err = generic_handle_domain_irq(ipi_mux_domain,
> +						hwirq);
> +		if (unlikely(err))
> +			pr_warn_ratelimited(
> +				"can't find mapping for hwirq %lu\n",
> +				hwirq);

How can this happen?

> +	}
> +
> +	if (ipi_mux_ops->ipi_mux_post_handle)

Same thing as above.

> +		ipi_mux_ops->ipi_mux_post_handle(ipi_mux_parent_virq,
> +						 ipi_mux_data);
> +}
> +
> +static void ipi_mux_handler(struct irq_desc *desc)
> +{
> +	struct irq_chip *chip = irq_desc_get_chip(desc);
> +
> +	chained_irq_enter(chip, desc);
> +	ipi_mux_process();
> +	chained_irq_exit(chip, desc);
> +}
> +
> +static int ipi_mux_dying_cpu(unsigned int cpu)
> +{
> +	disable_percpu_irq(ipi_mux_parent_virq);
> +	return 0;
> +}
> +
> +static int ipi_mux_starting_cpu(unsigned int cpu)
> +{
> +	enable_percpu_irq(ipi_mux_parent_virq,
> +			  irq_get_trigger_type(ipi_mux_parent_virq));
> +	return 0;
> +}
> +
> +/**
> + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
> + * parent IPI.
> + * @parent_virq:	virq of the parent per-CPU IRQ
> + * @nr_ipi:		number of virtual IPIs to create. This should
> + *			be <= BITS_PER_TYPE(int)

In general, please use 'long' rather than 'int', like for most other
things in the kernel.

> + * @ops:		multiplexing operations for the parent IPI
> + * @data:		opaque data used by the multiplexing operations
> + *
> + * If the parent IPI > 0 then ipi_mux_process() will be automatically
> + * called via chained handler.
> + *
> + * If the parent IPI <= 0 then it is responsibility of irqchip drivers
> + * to explicitly call ipi_mux_process() for processing muxed IPIs.
> + *
> + * Returns first virq of the newly created virtual IPIs upon success
> + * or <=0 upon failure
> + */
> +int ipi_mux_create(unsigned int parent_virq, unsigned int nr_ipi,
> +		   const struct ipi_mux_ops *ops, void *data)
> +{
> +	struct fwnode_handle *fwnode;
> +	struct irq_domain *domain;
> +	struct irq_fwspec ipi;
> +	int virq;
> +
> +	if (ipi_mux_domain || BITS_PER_TYPE(int) < nr_ipi ||
> +	    !ops || !ops->ipi_mux_send)
> +		return -EINVAL;
> +
> +	if (parent_virq &&
> +	    !irqd_is_per_cpu(irq_desc_get_irq_data(irq_to_desc(parent_virq))))
> +		return -EINVAL;
> +
> +	fwnode = irq_domain_alloc_named_fwnode("IPI-Mux");
> +	if (!fwnode) {
> +		pr_err("unable to create IPI Mux fwnode\n");
> +		return -ENOMEM;
> +	}
> +
> +	domain = irq_domain_create_simple(fwnode, nr_ipi, 0,
> +					  &ipi_mux_domain_ops, NULL);
> +	if (!domain) {
> +		pr_err("unable to add IPI Mux domain\n");
> +		irq_domain_free_fwnode(fwnode);
> +		return -ENOMEM;
> +	}
> +
> +	ipi.fwnode = domain->fwnode;
> +	ipi.param_count = 1;
> +	ipi.param[0] = 0;

This really deserves some documentation if this is to be used at scale
(and not be a riscv-special).

> +	virq = __irq_domain_alloc_irqs(domain, -1, nr_ipi,
> +				       NUMA_NO_NODE, &ipi, false, NULL);
> +	if (virq <= 0) {
> +		pr_err("unable to alloc IRQs from IPI Mux domain\n");
> +		irq_domain_remove(domain);
> +		irq_domain_free_fwnode(fwnode);
> +		return virq;
> +	}
> +
> +	ipi_mux_domain = domain;
> +	ipi_mux_data = data;
> +	ipi_mux_nr = nr_ipi;
> +	ipi_mux_parent_virq = parent_virq;
> +	ipi_mux_ops = ops;
> +
> +	if (parent_virq > 0) {
> +		irq_set_chained_handler(parent_virq, ipi_mux_handler);
> +		cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> +				  "irqchip/ipi-mux:starting",
> +				  ipi_mux_starting_cpu, ipi_mux_dying_cpu);
> +	}

This whole CPUHP thing feels like something that should stay outside
of the generic code. I'm pretty sure some architectures would have
specific requirements and wouldn't be able to just use any odd dynamic
value...

> +
> +	return virq;
> +}

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.