Hi,
Currently, pre-calculated set vectors are provided by the driver for
allocating & spreading vectors. This way only works when the driver passes
the same 'max_vecs' and 'min_vecs' to pci_alloc_irq_vectors_affinity(), and
it also requires the driver to retry the allocating & spreading.
As Bjorn and Keith mentioned, the current usage & interface for irq sets
is a bit awkward because the retrying should have been avoided by providing
one reasonable 'min_vecs'. However, if 'min_vecs' isn't the same as
'max_vecs', the number of allocated vectors is unknown before calling
pci_alloc_irq_vectors_affinity(), so each set's vectors can't be
pre-calculated.
Add a new callback of .calc_sets into 'struct irq_affinity' so that the
driver can calculate set vectors after the IRQ vectors are allocated and
before spreading IRQ vectors. Add 'priv' so that the driver may retrieve
its private data via the 'struct irq_affinity'.
V2:
- add .calc_sets instead of .setup_affinity(), which could easily be
abused by drivers
Ming Lei (4):
genirq/affinity: store irq set vectors in 'struct irq_affinity'
genirq/affinity: add new callback for calculating set vectors
nvme-pci: avoid irq allocation retrying via .calc_sets
genirq/affinity: Document .calc_sets as required in case of multiple
sets
drivers/nvme/host/pci.c | 63 ++++++++++++-----------------------------------
drivers/pci/msi.c | 4 +--
include/linux/interrupt.h | 13 +++++++---
kernel/irq/affinity.c | 19 +++++++++-----
4 files changed, 41 insertions(+), 58 deletions(-)
--
2.9.5
Currently the array of irq set vectors is provided by the driver.
irq_create_affinity_masks() can be simplified a bit by treating the
non-irq-set case as a single irq set.
So move this array into 'struct irq_affinity', and pre-define the max
set number as 4, which should be enough for normal cases.
Signed-off-by: Ming Lei <[email protected]>
---
drivers/nvme/host/pci.c | 5 ++---
include/linux/interrupt.h | 8 +++++---
kernel/irq/affinity.c | 10 ++++++----
3 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 022ea1ee63f8..0086bdf80ea1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2081,12 +2081,11 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
- int irq_sets[2];
struct irq_affinity affd = {
.pre_vectors = 1,
- .nr_sets = ARRAY_SIZE(irq_sets),
- .sets = irq_sets,
+ .nr_sets = 2,
};
+ int *irq_sets = affd.set_vectors;
int result = 0;
unsigned int irq_queues, this_p_queues;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7c9434652f36..a20150627a32 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -259,6 +259,8 @@ struct irq_affinity_notify {
void (*release)(struct kref *ref);
};
+#define IRQ_MAX_SETS 4
+
/**
* struct irq_affinity - Description for automatic irq affinity assignements
* @pre_vectors: Don't apply affinity to @pre_vectors at beginning of
@@ -266,13 +268,13 @@ struct irq_affinity_notify {
* @post_vectors: Don't apply affinity to @post_vectors at end of
* the MSI(-X) vector space
* @nr_sets: Length of passed in *sets array
- * @sets: Number of affinitized sets
+ * @set_vectors: Number of affinitized sets
*/
struct irq_affinity {
int pre_vectors;
int post_vectors;
int nr_sets;
- int *sets;
+ int set_vectors[IRQ_MAX_SETS];
};
/**
@@ -332,7 +334,7 @@ extern int
irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
struct irq_affinity_desc *
-irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+irq_create_affinity_masks(int nvec, struct irq_affinity *affd);
int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 118b66d64a53..a97b7c33d2db 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -239,7 +239,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
* Returns the irq_affinity_desc pointer or NULL if allocation failed.
*/
struct irq_affinity_desc *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+irq_create_affinity_masks(int nvecs, struct irq_affinity *affd)
{
int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
int curvec, usedvecs;
@@ -265,11 +265,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
* have multiple sets, build each sets affinity mask separately.
*/
nr_sets = affd->nr_sets;
- if (!nr_sets)
+ if (!nr_sets) {
nr_sets = 1;
+ affd->set_vectors[0] = affvecs;
+ }
for (i = 0, usedvecs = 0; i < nr_sets; i++) {
- int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+ int this_vecs = affd->set_vectors[i];
int ret;
ret = irq_build_affinity_masks(affd, curvec, this_vecs,
@@ -316,7 +318,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
int i;
for (i = 0, set_vecs = 0; i < affd->nr_sets; i++)
- set_vecs += affd->sets[i];
+ set_vecs += affd->set_vectors[i];
} else {
get_online_cpus();
set_vecs = cpumask_weight(cpu_possible_mask);
--
2.9.5
Currently, pre-calculated set vectors are provided by the driver for
allocating & spreading vectors. This way only works when the driver passes
the same 'max_vecs' and 'min_vecs' to pci_alloc_irq_vectors_affinity(), and
it also requires the driver to retry the allocating & spreading.
As Bjorn and Keith mentioned, the current usage & interface for irq sets
is a bit awkward because the retrying should have been avoided by providing
one reasonable 'min_vecs'. However, if 'min_vecs' isn't the same as
'max_vecs', the number of allocated vectors is unknown before calling
pci_alloc_irq_vectors_affinity(), so each set's vectors can't be
pre-calculated.
Add a new callback of .calc_sets into 'struct irq_affinity' so that the
driver can calculate set vectors after the IRQ vectors are allocated and
before spreading IRQ vectors. Add 'priv' so that the driver may retrieve
its private data via the 'struct irq_affinity'.
Suggested-by: Thomas Gleixner <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
include/linux/interrupt.h | 4 ++++
kernel/irq/affinity.c | 13 +++++++++----
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index a20150627a32..7a27f6ba1f2f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -269,12 +269,16 @@ struct irq_affinity_notify {
* the MSI(-X) vector space
* @nr_sets: Length of passed in *sets array
* @set_vectors: Number of affinitized sets
+ * @calc_sets: Callback for caculating set vectors
+ * @priv: Private data of @calc_sets
*/
struct irq_affinity {
int pre_vectors;
int post_vectors;
int nr_sets;
int set_vectors[IRQ_MAX_SETS];
+ void (*calc_sets)(struct irq_affinity *, int nvecs);
+ void *priv;
};
/**
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index a97b7c33d2db..34abba63df4d 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -264,11 +264,14 @@ irq_create_affinity_masks(int nvecs, struct irq_affinity *affd)
* Spread on present CPUs starting from affd->pre_vectors. If we
* have multiple sets, build each sets affinity mask separately.
*/
- nr_sets = affd->nr_sets;
- if (!nr_sets) {
+ if (affd->calc_sets) {
+ affd->calc_sets(affd, nvecs);
+ nr_sets = affd->nr_sets;
+ } else if (!affd->nr_sets) {
nr_sets = 1;
affd->set_vectors[0] = affvecs;
- }
+ } else
+ nr_sets = affd->nr_sets;
for (i = 0, usedvecs = 0; i < nr_sets; i++) {
int this_vecs = affd->set_vectors[i];
@@ -314,7 +317,9 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
if (resv > minvec)
return 0;
- if (affd->nr_sets) {
+ if (affd->calc_sets) {
+ set_vecs = vecs;
+ } else if (affd->nr_sets) {
int i;
for (i = 0, set_vecs = 0; i < affd->nr_sets; i++)
--
2.9.5
Now NVMe has implemented the .calc_sets callback for calculating each
set's vectors.
For other cases of multiple irq sets, it isn't a good way to pre-calculate
each set's vectors before allocating IRQ vectors, because the same issue
NVMe has exists there too.
Document .calc_sets as explicitly required for multiple sets.
Signed-off-by: Ming Lei <[email protected]>
---
drivers/pci/msi.c | 4 ++--
include/linux/interrupt.h | 3 ++-
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 4c0b47867258..9f91fa713141 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1039,7 +1039,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
* If the caller is passing in sets, we can't support a range of
* vectors. The caller needs to handle that.
*/
- if (affd && affd->nr_sets && minvec != maxvec)
+ if (affd && affd->nr_sets > 1 && !affd->calc_sets)
return -EINVAL;
if (WARN_ON_ONCE(dev->msi_enabled))
@@ -1097,7 +1097,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
* If the caller is passing in sets, we can't support a range of
* supported vectors. The caller needs to handle that.
*/
- if (affd && affd->nr_sets && minvec != maxvec)
+ if (affd && affd->nr_sets > 1 && !affd->calc_sets)
return -EINVAL;
if (WARN_ON_ONCE(dev->msix_enabled))
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 7a27f6ba1f2f..a053f7fb0ff1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -269,7 +269,8 @@ struct irq_affinity_notify {
* the MSI(-X) vector space
* @nr_sets: Length of passed in *sets array
* @set_vectors: Number of affinitized sets
- * @calc_sets: Callback for caculating set vectors
+ * @calc_sets: Callback for caculating set vectors, required for
+ * multiple irq sets.
* @priv: Private data of @calc_sets
*/
struct irq_affinity {
--
2.9.5
Currently each set's vectors are pre-calculated, and this way requires the
same 'max_vecs' and 'min_vecs' to be passed to
pci_alloc_irq_vectors_affinity(), so nvme_setup_irqs() has to retry in case
of allocation failure.
This usage & interface is a bit awkward because the retry should have
been avoided by providing one reasonable 'min_vecs'.
Implement the .calc_sets callback so that pci_alloc_irq_vectors_affinity()
can calculate each set's vectors after the IRQ vectors are allocated and
before spreading IRQs; then NVMe's retry in case of IRQ allocation failure
can be removed.
Signed-off-by: Ming Lei <[email protected]>
---
drivers/nvme/host/pci.c | 62 +++++++++++++------------------------------------
1 file changed, 16 insertions(+), 46 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0086bdf80ea1..ca381894542a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2078,14 +2078,25 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
}
}
+static void nvme_calc_irq_sets(struct irq_affinity *affd, int nvecs)
+{
+ struct nvme_dev *dev = affd->priv;
+
+ nvme_calc_io_queues(dev, nvecs);
+
+ affd->set_vectors[HCTX_TYPE_DEFAULT] = dev->io_queues[HCTX_TYPE_DEFAULT];
+ affd->set_vectors[HCTX_TYPE_READ] = dev->io_queues[HCTX_TYPE_READ];
+ affd->nr_sets = HCTX_TYPE_POLL;
+}
+
static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
struct irq_affinity affd = {
.pre_vectors = 1,
- .nr_sets = 2,
+ .calc_sets = nvme_calc_irq_sets,
+ .priv = dev,
};
- int *irq_sets = affd.set_vectors;
int result = 0;
unsigned int irq_queues, this_p_queues;
@@ -2102,50 +2113,8 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
}
dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
- /*
- * For irq sets, we have to ask for minvec == maxvec. This passes
- * any reduction back to us, so we can adjust our queue counts and
- * IRQ vector needs.
- */
- do {
- nvme_calc_io_queues(dev, irq_queues);
- irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
- irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
- if (!irq_sets[1])
- affd.nr_sets = 1;
-
- /*
- * If we got a failure and we're down to asking for just
- * 1 + 1 queues, just ask for a single vector. We'll share
- * that between the single IO queue and the admin queue.
- * Otherwise, we assign one independent vector to admin queue.
- */
- if (irq_queues > 1)
- irq_queues = irq_sets[0] + irq_sets[1] + 1;
-
- result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
- irq_queues,
- PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-
- /*
- * Need to reduce our vec counts. If we get ENOSPC, the
- * platform should support mulitple vecs, we just need
- * to decrease our ask. If we get EINVAL, the platform
- * likely does not. Back down to ask for just one vector.
- */
- if (result == -ENOSPC) {
- irq_queues--;
- if (!irq_queues)
- return result;
- continue;
- } else if (result == -EINVAL) {
- irq_queues = 1;
- continue;
- } else if (result <= 0)
- return -EIO;
- break;
- } while (1);
-
+ result = pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+ PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
return result;
}
@@ -3021,6 +2990,7 @@ static struct pci_driver nvme_driver = {
static int __init nvme_init(void)
{
+ BUILD_BUG_ON(HCTX_TYPE_POLL > IRQ_MAX_SETS);
return pci_register_driver(&nvme_driver);
}
--
2.9.5
On Tue, 12 Feb 2019, Ming Lei wrote:
> Currently the array of irq set vectors is provided by driver.
>
> irq_create_affinity_masks() can be simplied a bit by treating the
> non-irq-set case as single irq set.
>
> So move this array into 'struct irq_affinity', and pre-define the max
> set number as 4, which should be enough for normal cases.
You really want to add some sanity check which makes sure that nr_sets is
<= IRQ_MAX_SETS.
Thanks,
tglx
On Tue, 12 Feb 2019, Ming Lei wrote:
> Currently pre-caculated set vectors are provided by driver for
> allocating & spread vectors. This way only works when drivers passes
> same 'max_vecs' and 'min_vecs' to pci_alloc_irq_vectors_affinity(),
> also requires driver to retry the allocating & spread.
>
> As Bjorn and Keith mentioned, the current usage & interface for irq sets
> is a bit awkward because the retrying should have been avoided by providing
> one resonable 'min_vecs'. However, if 'min_vecs' isn't same with
> 'max_vecs', number of the allocated vectors is unknown before calling
> pci_alloc_irq_vectors_affinity(), then each set's vectors can't be
> pre-caculated.
>
> Add a new callback of .calc_sets into 'struct irq_affinity' so that
> driver can caculate set vectors after IRQ vector is allocated and
> before spread IRQ vectors. Add 'priv' so that driver may retrieve
> its private data via the 'struct irq_affinity'.
Nice changelog!
> * Spread on present CPUs starting from affd->pre_vectors. If we
> * have multiple sets, build each sets affinity mask separately.
> */
> - nr_sets = affd->nr_sets;
> - if (!nr_sets) {
> + if (affd->calc_sets) {
> + affd->calc_sets(affd, nvecs);
> + nr_sets = affd->nr_sets;
> + } else if (!affd->nr_sets) {
> nr_sets = 1;
> affd->set_vectors[0] = affvecs;
> - }
> + } else
> + nr_sets = affd->nr_sets;
Lacks curly brackets.
Aside of that it might make sense to get rid of the local variable, but
that should be done in patch 1 if at all. No strong opinion though.
Thanks,
tglx
On Tue, 12 Feb 2019, Ming Lei wrote:
> Hi,
>
> Currently pre-caculated set vectors are provided by driver for
> allocating & spread vectors. This way only works when drivers passes
> same 'max_vecs' and 'min_vecs' to pci_alloc_irq_vectors_affinity(),
> also requires driver to retry the allocating & spread.
>
> As Bjorn and Keith mentioned, the current usage & interface for irq sets
> is a bit awkward because the retrying should have been avoided by providing
> one resonable 'min_vecs'. However, if 'min_vecs' isn't same with
> 'max_vecs', number of the allocated vectors is unknown before calling
> pci_alloc_irq_vectors_affinity(), then each set's vectors can't be
> pre-caculated.
>
> Add a new callback of .calc_sets into 'struct irq_affinity' so that
> driver can caculate set vectors after IRQ vector is allocated and
> before spread IRQ vectors. Add 'priv' so that driver may retrieve
> its private data via the 'struct irq_affinity'.
>
>
> V2:
> - add .calc_sets instead of .setup_affinity() which is easy to
> be abused by drivers
This looks really well done. If you can address the minor nitpicks, then
this is good to go, unless someone has objections.
Thanks,
tglx
On Tue, Feb 12, 2019 at 05:04:38AM -0800, Ming Lei wrote:
> Currently pre-caculate each set vectors, and this way requires same
> 'max_vecs' and 'min_vecs' passed to pci_alloc_irq_vectors_affinity(),
> then nvme_setup_irqs() has to retry in case of allocation failure.
>
> This usage & interface is a bit awkward because the retry should have
> been avoided by providing one reasonable 'min_vecs'.
>
> Implement the callback of .calc_sets, so that pci_alloc_irq_vectors_affinity()
> can calculate each set's vector after IRQ vectors is allocated and
> before spread IRQ, then NVMe's retry in case of irq allocation failure
> can be removed.
>
> Signed-off-by: Ming Lei <[email protected]>
Thanks, Ming, this whole series looks like a great improvement for
drivers using irq sets.
Minor nit below. Otherwise you may add my review for the whole series
if you spin a v3 for the other minor comments.
Reviewed-by: Keith Busch <[email protected]>
> +static void nvme_calc_irq_sets(struct irq_affinity *affd, int nvecs)
> +{
> + struct nvme_dev *dev = affd->priv;
> +
> + nvme_calc_io_queues(dev, nvecs);
> +
> + affd->set_vectors[HCTX_TYPE_DEFAULT] = dev->io_queues[HCTX_TYPE_DEFAULT];
> + affd->set_vectors[HCTX_TYPE_READ] = dev->io_queues[HCTX_TYPE_READ];
> + affd->nr_sets = HCTX_TYPE_POLL;
> +}
The value of HCTX_TYPE_POLL happens to be 2, but that seems more of a
coincidence right now. Can we hard code 2 just in case the value changes?
On Tue, Feb 12, 2019 at 09:04:39PM +0800, Ming Lei wrote:
> Now NVMe has implemented the .calc_sets callback for caculating each
> set's vectors.
>
> For other cases of multiple irq sets, it isn't a good way to pre-caculate
> each set's vectors before allocating IRQ vectors because NVMe's same issue
> exists too.
s/irq/IRQ/ so it's consistent in the paragraph.
> Document .calc_sets as required explicitly for multiple sets.
>
> Signed-off-by: Ming Lei <[email protected]>
Acked-by: Bjorn Helgaas <[email protected]>
Note minor comments below.
> ---
> drivers/pci/msi.c | 4 ++--
> include/linux/interrupt.h | 3 ++-
> 2 files changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index 4c0b47867258..9f91fa713141 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -1039,7 +1039,7 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
> * If the caller is passing in sets, we can't support a range of
> * vectors. The caller needs to handle that.
Looks like this comment needs to be tweaked, maybe along the lines of:
If the caller requests multiple sets of IRQs where each set requires
different affinity, it must also supply a ->calc_sets() callback to
compute the affinity cpumask for each set.
(I'm not 100% clear on how calc_sets() works, so I might not have
described this exactly right.)
> */
> - if (affd && affd->nr_sets && minvec != maxvec)
> + if (affd && affd->nr_sets > 1 && !affd->calc_sets)
> return -EINVAL;
>
> if (WARN_ON_ONCE(dev->msi_enabled))
> @@ -1097,7 +1097,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
> * If the caller is passing in sets, we can't support a range of
> * supported vectors. The caller needs to handle that.
Similar comment update here?
> */
> - if (affd && affd->nr_sets && minvec != maxvec)
> + if (affd && affd->nr_sets > 1 && !affd->calc_sets)
> return -EINVAL;
>
> if (WARN_ON_ONCE(dev->msix_enabled))
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 7a27f6ba1f2f..a053f7fb0ff1 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -269,7 +269,8 @@ struct irq_affinity_notify {
> * the MSI(-X) vector space
> * @nr_sets: Length of passed in *sets array
> * @set_vectors: Number of affinitized sets
> - * @calc_sets: Callback for caculating set vectors
> + * @calc_sets: Callback for caculating set vectors, required for
> + * multiple irq sets.
> * @priv: Private data of @calc_sets
> */
> struct irq_affinity {
> --
> 2.9.5
>