On Sat, 2008-07-05 at 09:34 -0400, Matthew Wilcox wrote:
> This first part simply changes the msi_attrib data structure to store
> how many vectors have been allocated. In order to do this, I shrink the
> 'type' from 5 bits to 2 and rename it to _type to catch any unsuspecting
> users.

Please don't, it significantly uglifies the code IMHO. Just add a new
field for the size, I'd rather call it qsize to match the register.

If you're worried about bloating msi_desc, there's several fields in
there that are per-device not per-desc, so we could do another patch to
move them into pci_dev or something hanging off it, eg.
pci_dev->msi_info rather than storing them in every desc.

cheers

> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index 8c61304..92992a8 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -106,11 +106,11 @@ static void msix_flush_writes(unsigned int irq)
>
> entry = get_irq_msi(irq);
> BUG_ON(!entry || !entry->dev);
> - switch (entry->msi_attrib.type) {
> - case PCI_CAP_ID_MSI:
> + switch (entry->msi_attrib._type) {
> + case MSI_ATTRIB:
> /* nothing to do */
> break;
> - case PCI_CAP_ID_MSIX:
> + case MSIX_ATTRIB:
> {
> int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
> PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
> @@ -129,8 +129,8 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
>
> entry = get_irq_msi(irq);
> BUG_ON(!entry || !entry->dev);
> - switch (entry->msi_attrib.type) {
> - case PCI_CAP_ID_MSI:
> + switch (entry->msi_attrib._type) {
> + case MSI_ATTRIB:
> if (entry->msi_attrib.maskbit) {
> int pos;
> u32 mask_bits;
> @@ -144,7 +144,7 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
> msi_set_enable(entry->dev, !flag);
> }
> break;
> - case PCI_CAP_ID_MSIX:
> + case MSIX_ATTRIB:
> {
> int offset = entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
> PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET;
> @@ -162,8 +162,8 @@ static void msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
> void read_msi_msg(unsigned int irq, struct msi_msg *msg)
> {
> struct msi_desc *entry = get_irq_msi(irq);
> - switch(entry->msi_attrib.type) {
> - case PCI_CAP_ID_MSI:
> + switch(entry->msi_attrib._type) {
> + case MSI_ATTRIB:
> {
> struct pci_dev *dev = entry->dev;
> int pos = entry->msi_attrib.pos;
> @@ -182,7 +182,7 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
> msg->data = data;
> break;
> }
> - case PCI_CAP_ID_MSIX:
> + case MSIX_ATTRIB:
> {
> void __iomem *base;
> base = entry->mask_base +
> @@ -201,11 +201,17 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
> void write_msi_msg(unsigned int irq, struct msi_msg *msg)
> {
> struct msi_desc *entry = get_irq_msi(irq);
> - switch (entry->msi_attrib.type) {
> - case PCI_CAP_ID_MSI:
> + switch (entry->msi_attrib._type) {
> + case MSI_ATTRIB:
> {
> struct pci_dev *dev = entry->dev;
> int pos = entry->msi_attrib.pos;
> + u16 msgctl;
> +
> + pci_read_config_word(dev, msi_control_reg(pos), &msgctl);
> + msgctl &= ~PCI_MSI_FLAGS_QSIZE;
> + msgctl |= entry->msi_attrib.multiple << 4;
> + pci_write_config_word(dev, msi_control_reg(pos), msgctl);
>
> pci_write_config_dword(dev, msi_lower_address_reg(pos),
> msg->address_lo);
> @@ -220,7 +226,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
> }
> break;
> }
> - case PCI_CAP_ID_MSIX:
> + case MSIX_ATTRIB:
> {
> void __iomem *base;
> base = entry->mask_base +
> @@ -359,7 +365,7 @@ static int msi_capability_init(struct pci_dev *dev)
> if (!entry)
> return -ENOMEM;
>
> - entry->msi_attrib.type = PCI_CAP_ID_MSI;
> + entry->msi_attrib._type = MSI_ATTRIB;
> entry->msi_attrib.is_64 = is_64bit_address(control);
> entry->msi_attrib.entry_nr = 0;
> entry->msi_attrib.maskbit = is_mask_bit_support(control);
> @@ -446,7 +452,7 @@ static int msix_capability_init(struct pci_dev *dev,
> break;
>
> j = entries[i].entry;
> - entry->msi_attrib.type = PCI_CAP_ID_MSIX;
> + entry->msi_attrib._type = MSIX_ATTRIB;
> entry->msi_attrib.is_64 = 1;
> entry->msi_attrib.entry_nr = j;
> entry->msi_attrib.maskbit = 1;
> @@ -589,12 +595,13 @@ void pci_msi_shutdown(struct pci_dev* dev)
> u32 mask = entry->msi_attrib.maskbits_mask;
> msi_set_mask_bits(dev->irq, mask, ~mask);
> }
> - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
> + if (!entry->dev || entry->msi_attrib._type != MSI_ATTRIB)
> return;
>
> /* Restore dev->irq to its default pin-assertion irq */
> dev->irq = entry->msi_attrib.default_irq;
> }
> +
> void pci_disable_msi(struct pci_dev* dev)
> {
> struct msi_desc *entry;
> @@ -605,7 +612,7 @@ void pci_disable_msi(struct pci_dev* dev)
> pci_msi_shutdown(dev);
>
> entry = list_entry(dev->msi_list.next, struct msi_desc, list);
> - if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
> + if (!entry->dev || entry->msi_attrib._type != MSI_ATTRIB)
> return;
>
> msi_free_irqs(dev);
> @@ -624,7 +631,7 @@ static int msi_free_irqs(struct pci_dev* dev)
> arch_teardown_msi_irqs(dev);
>
> list_for_each_entry_safe(entry, tmp, &dev->msi_list, list) {
> - if (entry->msi_attrib.type == PCI_CAP_ID_MSIX) {
> + if (entry->msi_attrib._type == MSIX_ATTRIB) {
> writel(1, entry->mask_base + entry->msi_attrib.entry_nr
> * PCI_MSIX_ENTRY_SIZE
> + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
> diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
> index 3898f52..b72e0bd 100644
> --- a/drivers/pci/msi.h
> +++ b/drivers/pci/msi.h
> @@ -22,12 +22,8 @@
> #define msi_disable(control) control &= ~PCI_MSI_FLAGS_ENABLE
> #define multi_msi_capable(control) \
> (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
> -#define multi_msi_enable(control, num) \
> - control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
> #define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT))
> #define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT))
> -#define msi_enable(control, num) multi_msi_enable(control, num); \
> - control |= PCI_MSI_FLAGS_ENABLE
>
> #define msix_table_offset_reg(base) (base + 0x04)
> #define msix_pba_offset_reg(base) (base + 0x08)
> diff --git a/include/linux/msi.h b/include/linux/msi.h
> index 8f29392..d322148 100644
> --- a/include/linux/msi.h
> +++ b/include/linux/msi.h
> @@ -15,9 +15,13 @@ extern void unmask_msi_irq(unsigned int irq);
> extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
> extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
>
> +#define MSI_ATTRIB 1
> +#define MSIX_ATTRIB 2
> +
> struct msi_desc {
> struct {
> - __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */
> + __u8 _type : 2; /* {0: unused, 1:MSI, 2:MSI-X} */
> + __u8 multiple: 3; /* log2 number of messages */
> __u8 maskbit : 1; /* mask-pending bit supported ? */
> __u8 masked : 1;
> __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */
--
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

Attachments:

signature.asc (189.00 B)
This is a digitally signed message part

2008-07-07 02:05:51

On Sun, 2008-07-06 at 20:41 -0600, Matthew Wilcox wrote:
> On Mon, Jul 07, 2008 at 12:05:24PM +1000, Michael Ellerman wrote:
> > On Sat, 2008-07-05 at 09:34 -0400, Matthew Wilcox wrote:
> > > This first part simply changes the msi_attrib data structure to store
> > > how many vectors have been allocated. In order to do this, I shrink the
> > > 'type' from 5 bits to 2 and rename it to _type to catch any unsuspecting
> > > users.
> >
> > Please don't, it significantly uglifies the code IMHO. Just add a new
> > field for the size, I'd rather call it qsize to match the register.
>
> Uglifies the code? Seriously? Other than the _ addition (which really
> I just did to be sure I didn't miss a case), how is MSI_ATTRIB uglier
> than PCI_CAP_ID_MSI?

Yeah seriously :) The _ is part of it, but MSI_ATTRIB is uglier than
PCI_CAP_ID_MSI exactly because it's not PCI_CAP_ID_MSI, which exists and
is well defined and is used in the rest of the code.

> I'd like to rename the register definition from QSIZE. It's _not_ a
> queue. I don't know where this misunderstanding came from, but I
> certainly don't want to spread it any further.

I didn't say it was a queue, but a Q ;) But I agree it's not a good
name, the spec calls it "multiple message enable", nvec would match the
existing code best, or log_nvec.

> > If you're worried about bloating msi_desc, there's several fields in
> > there that are per-device not per-desc, so we could do another patch to
> > move them into pci_dev or something hanging off it, eg.
> > pci_dev->msi_info rather than storing them in every desc.
>
> Might be worth it anyway for devices with lots of MSI-X interrupts.

Eventually yeah, last I looked we didn't have any drivers using more
than a few MSI-X, but at some point it will happen.

> I think the MSI-X implementation is a bit poorly written anyway. If we
> had an array of msi_desc for each device, we could avoid the list_head
> in the msi_desc, for example. That'd save two pointers (8 or 16 bytes),
> plus the overhead of allocating each one individually.

Yeah that would be nice.

> I also think that MSI-X could be improved by changing the interface to
> do away with this msix_entry list passed in -- just allocate the irqs
> consecutively.

It would be nice, but as I said the other day we have at least one
driver (s2io) which asks for non-consecutive entries. That doesn't
affect the irq allocation, but you need some way for the driver to
express it.

cheers

--
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

Attachments:

signature.asc (189.00 B)
This is a digitally signed message part

2008-07-07 03:57:05

On Mon, 2008-07-07 at 10:17 -0600, Grant Grundler wrote:
> On Thu, Jul 03, 2008 at 01:24:29PM +1000, Benjamin Herrenschmidt wrote:
> ...
> > > Next, MSI requires that you assign a block of interrupts that is a power
> > > of two in size (between 2^0 and 2^5), and aligned to at least that power
> > > of two.
> ...
> > > One thing I do want to be clear in the API is that the driver can ask
> > > for any number of irqs, the pci layer will round up to the next power of
> > > two if necessary.
> >
> > Well, that's where I'm not happy. The API shouldn't expose the
> > "power-of-two" thing. The numbers shown to drivers aren't in the same
> > space as the source numbers as seen by the HW on many architectures and
> > thus don't need to have the same constraints.
>
> The drivers have to deal with the limitations of the HW spec.
> In this case it means they have to know they are getting power of 2
> number of interrupts. I think exposing this in the API is a requirement
> and not optional.

I don't think it's quite that strong. If a driver asked for 6 interrupts
the MSI code could set up 8 and have 2 just hooked to nothing. I'm not
sure that's a good idea, but it's possible.

cheers

--
Michael Ellerman
OzLabs, IBM Australia Development Lab

email: [email protected]
stime: [email protected]
notes: Michael Ellerman/Australia/IBM
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

2008-07-10 01:32:56

by Michael Ellerman

[permalink] [raw]

Subject: Re: [PATCH 1/4] PCI MSI: Store the number of messages in the msi_desc

On Mon, 2008-07-07 at 06:04 -0600, Matthew Wilcox wrote:
> On Mon, Jul 07, 2008 at 01:48:32PM +1000, Michael Ellerman wrote:
> > Yeah seriously :) The _ is part of it, but MSI_ATTRIB is uglier than
> > PCI_CAP_ID_MSI exactly because it's not PCI_CAP_ID_MSI, which exists and
> > is well defined and is used in the rest of the code.
>
> Here's an improvement over both the status quo and my patch -- simply
> use a single bit called is_msix.

That is cleaner, you get to fix it when they create MSIXX though ;)

> > I didn't say it was a queue, but a Q ;) But I agree it's not a good
> > name, the spec calls it "multiple message enable", nvec would match the
> > existing code best, or log_nvec.
>
> I don't see what's wrong with 'multiple'. log_nvec is clunky, and
> 'multiple' works well as a boolean (since 0 means 1 interrupt).

For me 'multiple' only makes sense as a boolean, but whatever.

> > > > If you're worried about bloating msi_desc, there's several fields in
> > > > there that are per-device not per-desc, so we could do another patch to
> > > > move them into pci_dev or something hanging off it, eg.
> > > > pci_dev->msi_info rather than storing them in every desc.
>
> Ouch. I just used pahole and discovered we were using 72 bytes on
> 64-bit. A swift rearrangement of a u16 gets us back down to 64.

pahole is awesome, nice find.

> Here's the replacement patch:

Perhaps I'm pedantic, but I'd rather it was two patches, one to change
type to is_msix and one to add the multiple flag.

> diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
> index 8c61304..8f7e483 100644
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c

> @@ -180,32 +167,31 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
...
> struct pci_dev *dev = entry->dev;
> int pos = entry->msi_attrib.pos;
> + u16 msgctl;
> +
> + pci_read_config_word(dev, msi_control_reg(pos), &msgctl);
> + msgctl &= ~PCI_MSI_FLAGS_QSIZE;
> + msgctl |= entry->msi_attrib.multiple << 4;
> + pci_write_config_word(dev, msi_control_reg(pos), msgctl);

A #define for "<< 4" would be nice. And should we be paranoid about
potentially writing 0b110 or 0b111 which are reserved?

cheers

--
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

Attachments:

signature.asc (189.00 B)
This is a digitally signed message part

2008-07-10 01:33:17

On Wed, 2008-07-09 at 19:43 -0600, Matthew Wilcox wrote:
> On Thu, Jul 10, 2008 at 11:32:44AM +1000, Michael Ellerman wrote:
> > > int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> > > {
> > > + if (type == PCI_CAP_ID_MSI && nvec > 1)
> > > + return 1;
> >
> > This should go in arch_msi_check_device(). We might move it into a
> > ppc_md routine eventually.
>
> I'm OK with that, but ...
>
> > > int __attribute__ ((weak))
> > > arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> > > {
> > > - struct msi_desc *entry;
> > > + struct msi_desc *desc;
> > > int ret;
> > >
> > > - list_for_each_entry(entry, &dev->msi_list, list) {
> > > - ret = arch_setup_msi_irq(dev, entry);
> > > + if ((type == PCI_CAP_ID_MSI) && (nvec > 1))
> > > + return 1;
> >
> > I think the check should be in the generic arch_msi_check_device(), so
> > archs can override just the check.
>
> ... then x86 has to implement arch_msi_check_device in order to _not_
> perform the check, which feels a bit bass-ackwards to me.

Agreed, but I think that's still better. You might have alignment
constraints or whatever you need to check as well.

> > >
> > > void __attribute__ ((weak))
> > > -arch_teardown_msi_irqs(struct pci_dev *dev)
> > > +arch_teardown_msi_irqs(struct pci_dev *dev, int nvec)
> > > {
> > > struct msi_desc *entry;
> > >
> > > list_for_each_entry(entry, &dev->msi_list, list) {
> > > - if (entry->irq != 0)
> > > - arch_teardown_msi_irq(entry->irq);
> > > + int i;
> > > + if (entry->irq == 0)
> > > + continue;
> > > + for (i = 0; i < nvec; i++)
> > > + arch_teardown_msi_irq(entry->irq + i);
> >
> > This looks wrong. You're looping through all MSIs for the device, and
> > then for each one you're looping through all MSIs for the device. And
> > you're assuming they're contiguous, which they won't be for MSI-X.
> >
> > AFAICS this code should work for you as it was.
>
> For MSI-X, nvec will be = 1. Maybe I should call it something else to
> avoid confusion. The code won't work for me as-was because it won't
> call arch_teardown_msi_irq() for all entries.

It will call arch_teardown_msi_irq() for all entries, unless they were
never allocated (entry->irq == 0). Or are we talking about different
things?

If you mean that you're allocating more irqs than there are entries then
you need to deal with that in arch_teardown_msi_irqs().

> > > @@ -737,6 +737,8 @@ extern void msi_remove_pci_irq_vectors(struct pci_dev *dev);
> > > extern void pci_restore_msi_state(struct pci_dev *dev);
> > > #endif
> > >
> > > +#define pci_enable_msi(pdev) pci_enable_msi_block(pdev, 1)
> >
> > Someone will probably say this should be a static inline.
>
> Not quite sure why. You don't get any better typechecking by making it
> a static inline.

Yeah I agree, just pointing it out.

cheers

--
Michael Ellerman
OzLabs, IBM Australia Development Lab

wwweb: http://michael.ellerman.id.au
phone: +61 2 6212 1183 (tie line 70 21183)

We do not inherit the earth from our ancestors,
we borrow it from our children. - S.M.A.R.T Person

Attachments:

signature.asc (189.00 B)
This is a digitally signed message part

2008-07-20 07:49:29

by Grant Grundler

[permalink] [raw]

Subject: Re: [PATCH 3/4] AHCI: Request multiple MSIs

On Mon, Jul 07, 2008 at 11:48:03AM -0600, Matthew Wilcox wrote:
> On Mon, Jul 07, 2008 at 10:45:34AM -0600, Grant Grundler wrote:
> > If the system is busy, the readl is the cost of coalescing the
> > interrupts. I suspect it's cheaper to take one readl than
> > handle 16 individual interrupts.
>
> 16 would be a maximum imposed by the AHCI spec. My ICH9 board has 6
> ports, but requests all 16 interrupts.
>
> > I'm just pointing out the only upside of the existing code and not trying
> > to argue against this patch.
>
> There may well be an upside to the existing code, but it's pretty slim.
> The oprofile shows clearly that ahci_interrupt is the largest consumer of
> time during an iozone run. The only thing that routine does is read the
> HOST_IRQ_STAT register, acquire the spinlock and loop calling
> ahci_port_intr().
>
> I don't have a profile for this new code yet. Hopefully we'll have one
> by the end of the day.

Willy,
were you able to get this profile?
I'm still curious.

>
> > BTW, one more downside of the regular IRQ is it's possibly shared.
> > Using MSI guarantees an exclusive IRQ and avoids spurious readl's
> > when AHCI is not busy but the other device is. This would be worth
> > noting (or as a reminder) in the change log or as a comment in
> > the code.
>
> AHCI already allocates itself a new MSI if the machine supports MSI.
> This change merely extends AHCI to use multiple MSIs.

ok.

thanks,
grant

>
> Thanks.
>
> --
> Intel are signing my paycheques ... these opinions are still mine
> "Bill, look, we understand that you're interested in selling us this
> operating system, but compare it to ours. We can't possibly take such
> a retrograde step."