2018-03-01 21:32:51

by KarimAllah Ahmed

[permalink] [raw]
Subject: [PATCH v3 1/2] PCI/IOV: Store more data about VFs into the SRIOV struct

Store more data about PCI VFs into the SRIOV to avoid reading them from the
config space of all the PCI VFs. This is specially a useful optimization
when bringing up thousands of VFs.

Cc: Bjorn Helgaas <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: KarimAllah Ahmed <[email protected]>
---
v2 -> v3:
* Update changelog
* Move the call to pci_read_vf_config_common a bit later and use standard
pci_read_config*.
* Update whitespace.
* Move the using barsz into its own patch.
* Added a comment about the usage of subsystem vendor id, subsystem id, and
class revision.
* Make sure virtfn->is_virtfn is set before calling into pci_setup_device.

v1 -> v2:
* Rebase on latest + remove dependency on a non-upstream patch.

drivers/pci/iov.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
drivers/pci/pci.h | 5 +++++
drivers/pci/probe.c | 18 ++++++++++++++----
3 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 677924a..10291a0 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -114,6 +114,36 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
return dev->sriov->barsz[resno - PCI_IOV_RESOURCES];
}

+static void pci_read_vf_config_common(struct pci_dev *virtfn)
+{
+ struct pci_dev *physfn = virtfn->physfn;
+
+ BUG_ON(!virtfn->is_virtfn || physfn->is_virtfn);
+
+ /*
+ * Per PCIe r4.0, sec 9.3.4.1.5, the value reported in the VF maybe
+ * different than the value reported in the PF. We assume here that all
+ * VFs would report the same revision ID.
+ */
+ pci_read_config_dword(virtfn, PCI_CLASS_REVISION,
+ &physfn->sriov->class);
+ /*
+ * Per PCIe r4.0, sec 9.3.4.1.13, the field in the PF and the
+ * associated VFs must return the same value.
+ */
+ pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID,
+ &physfn->sriov->subsystem_vendor);
+ /*
+ * Per PCIe r4.0, sec 9.3.4.1.14, the value reported in the VF maybe
+ * different than the value reported in the PF. We assume here that all
+ * VFs would report the same subsystem ID.
+ */
+ pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
+ &physfn->sriov->subsystem_device);
+ pci_read_config_byte(virtfn, PCI_HEADER_TYPE,
+ &physfn->sriov->hdr_type);
+}
+
int pci_iov_add_virtfn(struct pci_dev *dev, int id)
{
int i;
@@ -134,15 +164,18 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
goto failed0;

virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
+ virtfn->is_virtfn = 1;
+ virtfn->physfn = pci_dev_get(dev);
+ if (id == 0)
+ /* virtfn->{devfn,bus,is_virtfn,physfn} have to be initialized */
+ pci_read_vf_config_common(virtfn);
virtfn->vendor = dev->vendor;
virtfn->device = iov->vf_device;
rc = pci_setup_device(virtfn);
if (rc)
- goto failed0;
+ goto failed1;

virtfn->dev.parent = dev->dev.parent;
- virtfn->physfn = pci_dev_get(dev);
- virtfn->is_virtfn = 1;
virtfn->multifunction = 0;

for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
@@ -163,10 +196,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
sprintf(buf, "virtfn%u", id);
rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf);
if (rc)
- goto failed1;
+ goto failed2;
rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
if (rc)
- goto failed2;
+ goto failed3;

kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);

@@ -174,11 +207,12 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)

return 0;

-failed2:
+failed3:
sysfs_remove_link(&dev->dev.kobj, buf);
+failed2:
+ pci_stop_and_remove_bus_device(virtfn);
failed1:
pci_dev_put(dev);
- pci_stop_and_remove_bus_device(virtfn);
failed0:
virtfn_remove_bus(dev->bus, bus);
failed:
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index fcd8191..17e6688 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -271,6 +271,11 @@ struct pci_sriov {
u16 driver_max_VFs; /* Max num VFs driver supports */
struct pci_dev *dev; /* Lowest numbered PF */
struct pci_dev *self; /* This PF */
+ u8 hdr_type; /* VF header type */
+ u32 class; /* VF device */
+ u16 device; /* VF device */
+ u16 subsystem_vendor; /* VF subsystem vendor */
+ u16 subsystem_device; /* VF subsystem device */
resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */
bool drivers_autoprobe; /* Auto probing of VFs by driver */
};
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ef53774..a96837e 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1454,7 +1454,9 @@ int pci_setup_device(struct pci_dev *dev)
struct pci_bus_region region;
struct resource *res;

- if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
+ if (dev->is_virtfn)
+ hdr_type = dev->physfn->sriov->hdr_type;
+ else if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
return -EIO;

dev->sysdata = dev->bus->sysdata;
@@ -1477,7 +1479,10 @@ int pci_setup_device(struct pci_dev *dev)
dev->bus->number, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn));

- pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
+ if (dev->is_virtfn)
+ class = dev->physfn->sriov->class;
+ else
+ pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
dev->revision = class & 0xff;
dev->class = class >> 8; /* upper 3 bytes */

@@ -1517,8 +1522,13 @@ int pci_setup_device(struct pci_dev *dev)
goto bad;
pci_read_irq(dev);
pci_read_bases(dev, 6, PCI_ROM_ADDRESS);
- pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
- pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
+ if (dev->is_virtfn) {
+ dev->subsystem_vendor = dev->physfn->sriov->subsystem_vendor;
+ dev->subsystem_device = dev->physfn->sriov->subsystem_device;
+ } else {
+ pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
+ pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
+ }

/*
* Do the ugly legacy mode stuff here rather than broken chip
--
2.7.4



2018-03-01 21:33:06

by KarimAllah Ahmed

[permalink] [raw]
Subject: [PATCH v3 2/2] PCI/IOV: Use the cached VF BARs size instead of re-reading them

Use the cached VF BARs size instead of re-reading them from the hardware.
That avoids doing unnecessarily bus transactions which is specially
noticable when you have a PF with a large number of VFs.

Cc: Bjorn Helgaas <[email protected]>
Cc: [email protected]
Cc: [email protected]
Signed-off-by: KarimAllah Ahmed <[email protected]>
---
drivers/pci/probe.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a96837e..aeaa10a 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -180,6 +180,7 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
struct resource *res, unsigned int pos)
{
+ int bar = res - dev->resource;
u32 l = 0, sz = 0, mask;
u64 l64, sz64, mask64;
u16 orig_cmd;
@@ -199,9 +200,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
res->name = pci_name(dev);

pci_read_config_dword(dev, pos, &l);
- pci_write_config_dword(dev, pos, l | mask);
- pci_read_config_dword(dev, pos, &sz);
- pci_write_config_dword(dev, pos, l);
+ if (dev->is_virtfn) {
+ sz = dev->physfn->sriov->barsz[bar] & 0xffffffff;
+ } else {
+ pci_write_config_dword(dev, pos, l | mask);
+ pci_read_config_dword(dev, pos, &sz);
+ pci_write_config_dword(dev, pos, l);
+ }

/*
* All bits set in sz means the device isn't working properly.
@@ -241,9 +246,14 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,

if (res->flags & IORESOURCE_MEM_64) {
pci_read_config_dword(dev, pos + 4, &l);
- pci_write_config_dword(dev, pos + 4, ~0);
- pci_read_config_dword(dev, pos + 4, &sz);
- pci_write_config_dword(dev, pos + 4, l);
+
+ if (dev->is_virtfn) {
+ sz = (dev->physfn->sriov->barsz[bar] >> 32) & 0xffffffff;
+ } else {
+ pci_write_config_dword(dev, pos + 4, ~0);
+ pci_read_config_dword(dev, pos + 4, &sz);
+ pci_write_config_dword(dev, pos + 4, l);
+ }

l64 |= ((u64)l << 32);
sz64 |= ((u64)sz << 32);
@@ -332,6 +342,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
for (pos = 0; pos < howmany; pos++) {
struct resource *res = &dev->resource[pos];
reg = PCI_BASE_ADDRESS_0 + (pos << 2);
+ if (dev->is_virtfn && dev->physfn->sriov->barsz[pos] == 0)
+ continue;
pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
}

--
2.7.4


2018-03-02 22:05:43

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] PCI/IOV: Store more data about VFs into the SRIOV struct

On Thu, Mar 01, 2018 at 10:31:36PM +0100, KarimAllah Ahmed wrote:
> Store more data about PCI VFs into the SRIOV to avoid reading them from the
> config space of all the PCI VFs. This is specially a useful optimization
> when bringing up thousands of VFs.
>
> Cc: Bjorn Helgaas <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: KarimAllah Ahmed <[email protected]>

Applied to pci/virtualization for v4.17, thanks!

I removed the pci_sriov.device field, which seemed to be unused, and
tweaked a few other things, so make sure I didn't break anything. Here's
what I have currently applied:

commit e17b7b429b095200f93ad37c4efeb7a99b6fce3b
Author: KarimAllah Ahmed <[email protected]>
Date: Thu Mar 1 22:31:36 2018 +0100

PCI/IOV: Use VF0 cached config registers for other VFs

Cache some config data from VF0 and use it for all other VFs instead of
reading it from the config space of each VF. We assume these items are the
same across all associated VFs:

Revision ID
Class Code
Subsystem Vendor ID
Subsystem ID

This is an optimization when enabling SR-IOV on a device with many VFs.

Signed-off-by: KarimAllah Ahmed <[email protected]>
[bhelgaas: changelog, simplify comments, remove unused "device"]
Signed-off-by: Bjorn Helgaas <[email protected]>

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 677924ae0350..30bf8f706ed9 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -114,6 +114,29 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
return dev->sriov->barsz[resno - PCI_IOV_RESOURCES];
}

+static void pci_read_vf_config_common(struct pci_dev *virtfn)
+{
+ struct pci_dev *physfn = virtfn->physfn;
+
+ /*
+ * Some config registers are the same across all associated VFs.
+ * Read them once from VF0 so we can skip reading them from the
+ * other VFs.
+ *
+ * PCIe r4.0, sec 9.3.4.1, technically doesn't require all VFs to
+ * have the same Revision ID and Subsystem ID, but we assume they
+ * do.
+ */
+ pci_read_config_dword(virtfn, PCI_CLASS_REVISION,
+ &physfn->sriov->class);
+ pci_read_config_byte(virtfn, PCI_HEADER_TYPE,
+ &physfn->sriov->hdr_type);
+ pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID,
+ &physfn->sriov->subsystem_vendor);
+ pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
+ &physfn->sriov->subsystem_device);
+}
+
int pci_iov_add_virtfn(struct pci_dev *dev, int id)
{
int i;
@@ -136,13 +159,17 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
virtfn->vendor = dev->vendor;
virtfn->device = iov->vf_device;
+ virtfn->is_virtfn = 1;
+ virtfn->physfn = pci_dev_get(dev);
+
+ if (id == 0)
+ pci_read_vf_config_common(virtfn);
+
rc = pci_setup_device(virtfn);
if (rc)
- goto failed0;
+ goto failed1;

virtfn->dev.parent = dev->dev.parent;
- virtfn->physfn = pci_dev_get(dev);
- virtfn->is_virtfn = 1;
virtfn->multifunction = 0;

for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
@@ -163,10 +190,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
sprintf(buf, "virtfn%u", id);
rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf);
if (rc)
- goto failed1;
+ goto failed2;
rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
if (rc)
- goto failed2;
+ goto failed3;

kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);

@@ -174,11 +201,12 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)

return 0;

-failed2:
+failed3:
sysfs_remove_link(&dev->dev.kobj, buf);
+failed2:
+ pci_stop_and_remove_bus_device(virtfn);
failed1:
pci_dev_put(dev);
- pci_stop_and_remove_bus_device(virtfn);
failed0:
virtfn_remove_bus(dev->bus, bus);
failed:
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index fcd81911b127..db76933be859 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -271,6 +271,10 @@ struct pci_sriov {
u16 driver_max_VFs; /* Max num VFs driver supports */
struct pci_dev *dev; /* Lowest numbered PF */
struct pci_dev *self; /* This PF */
+ u32 class; /* VF class */
+ u8 hdr_type; /* VF header type */
+ u16 subsystem_vendor; /* VF subsystem vendor */
+ u16 subsystem_device; /* VF subsystem device */
resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */
bool drivers_autoprobe; /* Auto probing of VFs by driver */
};
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index a1cddca37793..78deb950bda1 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1461,7 +1461,9 @@ int pci_setup_device(struct pci_dev *dev)
struct pci_bus_region region;
struct resource *res;

- if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
+ if (dev->is_virtfn)
+ hdr_type = dev->physfn->sriov->hdr_type;
+ else if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
return -EIO;

dev->sysdata = dev->bus->sysdata;
@@ -1484,7 +1486,10 @@ int pci_setup_device(struct pci_dev *dev)
dev->bus->number, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn));

- pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
+ if (dev->is_virtfn)
+ class = dev->physfn->sriov->class;
+ else
+ pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
dev->revision = class & 0xff;
dev->class = class >> 8; /* upper 3 bytes */

@@ -1524,8 +1529,13 @@ int pci_setup_device(struct pci_dev *dev)
goto bad;
pci_read_irq(dev);
pci_read_bases(dev, 6, PCI_ROM_ADDRESS);
- pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
- pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
+ if (dev->is_virtfn) {
+ dev->subsystem_vendor = dev->physfn->sriov->subsystem_vendor;
+ dev->subsystem_device = dev->physfn->sriov->subsystem_device;
+ } else {
+ pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
+ pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
+ }

/*
* Do the ugly legacy mode stuff here rather than broken chip

2018-03-02 22:07:22

by Bjorn Helgaas

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] PCI/IOV: Use the cached VF BARs size instead of re-reading them

On Thu, Mar 01, 2018 at 10:31:37PM +0100, KarimAllah Ahmed wrote:
> Use the cached VF BARs size instead of re-reading them from the hardware.
> That avoids doing unnecessarily bus transactions which is specially
> noticable when you have a PF with a large number of VFs.

Thanks a lot for breaking this out! It seems trivial, but it did make it
much easier for me to think about this one.

> Cc: Bjorn Helgaas <[email protected]>
> Cc: [email protected]
> Cc: [email protected]
> Signed-off-by: KarimAllah Ahmed <[email protected]>
> ---
> drivers/pci/probe.c | 24 ++++++++++++++++++------
> 1 file changed, 18 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index a96837e..aeaa10a 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -180,6 +180,7 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
> int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> struct resource *res, unsigned int pos)
> {
> + int bar = res - dev->resource;
> u32 l = 0, sz = 0, mask;
> u64 l64, sz64, mask64;
> u16 orig_cmd;
> @@ -199,9 +200,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> res->name = pci_name(dev);
>
> pci_read_config_dword(dev, pos, &l);
> - pci_write_config_dword(dev, pos, l | mask);
> - pci_read_config_dword(dev, pos, &sz);
> - pci_write_config_dword(dev, pos, l);
> + if (dev->is_virtfn) {
> + sz = dev->physfn->sriov->barsz[bar] & 0xffffffff;
> + } else {
> + pci_write_config_dword(dev, pos, l | mask);
> + pci_read_config_dword(dev, pos, &sz);
> + pci_write_config_dword(dev, pos, l);
> + }

I don't quite understand this. This is reading the regular BARs (config
offsets 0x10, 0x14, ..., 0x24). Per sec 9.3.4.1.11, these are all RO Zero
for VFs. That should make them look like they're all unimplemented.

But this patch makes us use the size we discovered from the PF's VF BARn
registers in its SR-IOV capability. Won't that cause us to fill in the
VF's dev->resource[n], when we didn't do it before?

> /*
> * All bits set in sz means the device isn't working properly.
> @@ -241,9 +246,14 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
>
> if (res->flags & IORESOURCE_MEM_64) {
> pci_read_config_dword(dev, pos + 4, &l);
> - pci_write_config_dword(dev, pos + 4, ~0);
> - pci_read_config_dword(dev, pos + 4, &sz);
> - pci_write_config_dword(dev, pos + 4, l);
> +
> + if (dev->is_virtfn) {
> + sz = (dev->physfn->sriov->barsz[bar] >> 32) & 0xffffffff;
> + } else {
> + pci_write_config_dword(dev, pos + 4, ~0);
> + pci_read_config_dword(dev, pos + 4, &sz);
> + pci_write_config_dword(dev, pos + 4, l);
> + }
>
> l64 |= ((u64)l << 32);
> sz64 |= ((u64)sz << 32);
> @@ -332,6 +342,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
> for (pos = 0; pos < howmany; pos++) {
> struct resource *res = &dev->resource[pos];
> reg = PCI_BASE_ADDRESS_0 + (pos << 2);
> + if (dev->is_virtfn && dev->physfn->sriov->barsz[pos] == 0)
> + continue;

Since we know the VF BARs are all zero (the ones in the VF config space,
not the ones in the PF SR-IOV capability), including the VF ROM BAR, it
would make sense to me to totally skip this whole function, e.g.,

if (dev->non_compliant_bars)
return;

if (dev->is_virtfn)
return;

> pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
> }
>
> --
> 2.7.4
>

2018-03-03 04:35:37

by KarimAllah Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3 2/2] PCI/IOV: Use the cached VF BARs size instead of re-reading them

On Fri, 2018-03-02 at 15:48 -0600, Bjorn Helgaas wrote:
> On Thu, Mar 01, 2018 at 10:31:37PM +0100, KarimAllah Ahmed wrote:
> >
> > Use the cached VF BARs size instead of re-reading them from the hardware.
> > That avoids doing unnecessarily bus transactions which is specially
> > noticable when you have a PF with a large number of VFs.
>
> Thanks a lot for breaking this out! It seems trivial, but it did make it
> much easier for me to think about this one.
>
> >
> > Cc: Bjorn Helgaas <[email protected]>
> > Cc: [email protected]
> > Cc: [email protected]
> > Signed-off-by: KarimAllah Ahmed <[email protected]>
> > ---
> > drivers/pci/probe.c | 24 ++++++++++++++++++------
> > 1 file changed, 18 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> > index a96837e..aeaa10a 100644
> > --- a/drivers/pci/probe.c
> > +++ b/drivers/pci/probe.c
> > @@ -180,6 +180,7 @@ static inline unsigned long decode_bar(struct pci_dev *dev, u32 bar)
> > int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> > struct resource *res, unsigned int pos)
> > {
> > + int bar = res - dev->resource;
> > u32 l = 0, sz = 0, mask;
> > u64 l64, sz64, mask64;
> > u16 orig_cmd;
> > @@ -199,9 +200,13 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> > res->name = pci_name(dev);
> >
> > pci_read_config_dword(dev, pos, &l);
> > - pci_write_config_dword(dev, pos, l | mask);
> > - pci_read_config_dword(dev, pos, &sz);
> > - pci_write_config_dword(dev, pos, l);
> > + if (dev->is_virtfn) {
> > + sz = dev->physfn->sriov->barsz[bar] & 0xffffffff;
> > + } else {
> > + pci_write_config_dword(dev, pos, l | mask);
> > + pci_read_config_dword(dev, pos, &sz);
> > + pci_write_config_dword(dev, pos, l);
> > + }
>
> I don't quite understand this. This is reading the regular BARs (config
> offsets 0x10, 0x14, ..., 0x24). Per sec 9.3.4.1.11, these are all RO Zero
> for VFs. That should make them look like they're all unimplemented.
>
> But this patch makes us use the size we discovered from the PF's VF BARn
> registers in its SR-IOV capability. Won't that cause us to fill in the
> VF's dev->resource[n], when we didn't do it before?

Oh .. that is correct! I did not notice this part from the spec :)

>
> >
> > /*
> > * All bits set in sz means the device isn't working properly.
> > @@ -241,9 +246,14 @@ int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
> >
> > if (res->flags & IORESOURCE_MEM_64) {
> > pci_read_config_dword(dev, pos + 4, &l);
> > - pci_write_config_dword(dev, pos + 4, ~0);
> > - pci_read_config_dword(dev, pos + 4, &sz);
> > - pci_write_config_dword(dev, pos + 4, l);
> > +
> > + if (dev->is_virtfn) {
> > + sz = (dev->physfn->sriov->barsz[bar] >> 32) & 0xffffffff;
> > + } else {
> > + pci_write_config_dword(dev, pos + 4, ~0);
> > + pci_read_config_dword(dev, pos + 4, &sz);
> > + pci_write_config_dword(dev, pos + 4, l);
> > + }
> >
> > l64 |= ((u64)l << 32);
> > sz64 |= ((u64)sz << 32);
> > @@ -332,6 +342,8 @@ static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
> > for (pos = 0; pos < howmany; pos++) {
> > struct resource *res = &dev->resource[pos];
> > reg = PCI_BASE_ADDRESS_0 + (pos << 2);
> > + if (dev->is_virtfn && dev->physfn->sriov->barsz[pos] == 0)
> > + continue;
>
> Since we know the VF BARs are all zero (the ones in the VF config space,
> not the ones in the PF SR-IOV capability), including the VF ROM BAR, it
> would make sense to me to totally skip this whole function, e.g.,
>
> if (dev->non_compliant_bars)
> return;
>
> if (dev->is_virtfn)
> return;
>

Correct! Done.

> >
> > pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
> > }
> >
> > --
> > 2.7.4
> >
>
Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B

2018-03-06 10:38:18

by KarimAllah Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3 1/2] PCI/IOV: Store more data about VFs into the SRIOV struct

On Fri, 2018-03-02 at 15:36 -0600, Bjorn Helgaas wrote:
> On Thu, Mar 01, 2018 at 10:31:36PM +0100, KarimAllah Ahmed wrote:
> >
> > Store more data about PCI VFs into the SRIOV to avoid reading them from the
> > config space of all the PCI VFs. This is specially a useful optimization
> > when bringing up thousands of VFs.
> >
> > Cc: Bjorn Helgaas <[email protected]>
> > Cc: [email protected]
> > Cc: [email protected]
> > Signed-off-by: KarimAllah Ahmed <[email protected]>
>
> Applied to pci/virtualization for v4.17, thanks!
>
> I removed the pci_sriov.device field, which seemed to be unused, and
> tweaked a few other things, so make sure I didn't break anything.

Yup, still looks good (and works) for me. Thanks.

> Here's what I have currently applied:
>
> commit e17b7b429b095200f93ad37c4efeb7a99b6fce3b
> Author: KarimAllah Ahmed <[email protected]>
> Date: Thu Mar 1 22:31:36 2018 +0100
>
> PCI/IOV: Use VF0 cached config registers for other VFs
>
> Cache some config data from VF0 and use it for all other VFs instead of
> reading it from the config space of each VF. We assume these items are the
> same across all associated VFs:
>
> Revision ID
> Class Code
> Subsystem Vendor ID
> Subsystem ID
>
> This is an optimization when enabling SR-IOV on a device with many VFs.
>
> Signed-off-by: KarimAllah Ahmed <[email protected]>
> [bhelgaas: changelog, simplify comments, remove unused "device"]
> Signed-off-by: Bjorn Helgaas <[email protected]>
>
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 677924ae0350..30bf8f706ed9 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -114,6 +114,29 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
> return dev->sriov->barsz[resno - PCI_IOV_RESOURCES];
> }
>
> +static void pci_read_vf_config_common(struct pci_dev *virtfn)
> +{
> + struct pci_dev *physfn = virtfn->physfn;
> +
> + /*
> + * Some config registers are the same across all associated VFs.
> + * Read them once from VF0 so we can skip reading them from the
> + * other VFs.
> + *
> + * PCIe r4.0, sec 9.3.4.1, technically doesn't require all VFs to
> + * have the same Revision ID and Subsystem ID, but we assume they
> + * do.
> + */
> + pci_read_config_dword(virtfn, PCI_CLASS_REVISION,
> + &physfn->sriov->class);
> + pci_read_config_byte(virtfn, PCI_HEADER_TYPE,
> + &physfn->sriov->hdr_type);
> + pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID,
> + &physfn->sriov->subsystem_vendor);
> + pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID,
> + &physfn->sriov->subsystem_device);
> +}
> +
> int pci_iov_add_virtfn(struct pci_dev *dev, int id)
> {
> int i;
> @@ -136,13 +159,17 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
> virtfn->devfn = pci_iov_virtfn_devfn(dev, id);
> virtfn->vendor = dev->vendor;
> virtfn->device = iov->vf_device;
> + virtfn->is_virtfn = 1;
> + virtfn->physfn = pci_dev_get(dev);
> +
> + if (id == 0)
> + pci_read_vf_config_common(virtfn);
> +
> rc = pci_setup_device(virtfn);
> if (rc)
> - goto failed0;
> + goto failed1;
>
> virtfn->dev.parent = dev->dev.parent;
> - virtfn->physfn = pci_dev_get(dev);
> - virtfn->is_virtfn = 1;
> virtfn->multifunction = 0;
>
> for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
> @@ -163,10 +190,10 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
> sprintf(buf, "virtfn%u", id);
> rc = sysfs_create_link(&dev->dev.kobj, &virtfn->dev.kobj, buf);
> if (rc)
> - goto failed1;
> + goto failed2;
> rc = sysfs_create_link(&virtfn->dev.kobj, &dev->dev.kobj, "physfn");
> if (rc)
> - goto failed2;
> + goto failed3;
>
> kobject_uevent(&virtfn->dev.kobj, KOBJ_CHANGE);
>
> @@ -174,11 +201,12 @@ int pci_iov_add_virtfn(struct pci_dev *dev, int id)
>
> return 0;
>
> -failed2:
> +failed3:
> sysfs_remove_link(&dev->dev.kobj, buf);
> +failed2:
> + pci_stop_and_remove_bus_device(virtfn);
> failed1:
> pci_dev_put(dev);
> - pci_stop_and_remove_bus_device(virtfn);
> failed0:
> virtfn_remove_bus(dev->bus, bus);
> failed:
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index fcd81911b127..db76933be859 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -271,6 +271,10 @@ struct pci_sriov {
> u16 driver_max_VFs; /* Max num VFs driver supports */
> struct pci_dev *dev; /* Lowest numbered PF */
> struct pci_dev *self; /* This PF */
> + u32 class; /* VF class */
> + u8 hdr_type; /* VF header type */
> + u16 subsystem_vendor; /* VF subsystem vendor */
> + u16 subsystem_device; /* VF subsystem device */
> resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */
> bool drivers_autoprobe; /* Auto probing of VFs by driver */
> };
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index a1cddca37793..78deb950bda1 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -1461,7 +1461,9 @@ int pci_setup_device(struct pci_dev *dev)
> struct pci_bus_region region;
> struct resource *res;
>
> - if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
> + if (dev->is_virtfn)
> + hdr_type = dev->physfn->sriov->hdr_type;
> + else if (pci_read_config_byte(dev, PCI_HEADER_TYPE, &hdr_type))
> return -EIO;
>
> dev->sysdata = dev->bus->sysdata;
> @@ -1484,7 +1486,10 @@ int pci_setup_device(struct pci_dev *dev)
> dev->bus->number, PCI_SLOT(dev->devfn),
> PCI_FUNC(dev->devfn));
>
> - pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
> + if (dev->is_virtfn)
> + class = dev->physfn->sriov->class;
> + else
> + pci_read_config_dword(dev, PCI_CLASS_REVISION, &class);
> dev->revision = class & 0xff;
> dev->class = class >> 8; /* upper 3 bytes */
>
> @@ -1524,8 +1529,13 @@ int pci_setup_device(struct pci_dev *dev)
> goto bad;
> pci_read_irq(dev);
> pci_read_bases(dev, 6, PCI_ROM_ADDRESS);
> - pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
> - pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
> + if (dev->is_virtfn) {
> + dev->subsystem_vendor = dev->physfn->sriov->subsystem_vendor;
> + dev->subsystem_device = dev->physfn->sriov->subsystem_device;
> + } else {
> + pci_read_config_word(dev, PCI_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
> + pci_read_config_word(dev, PCI_SUBSYSTEM_ID, &dev->subsystem_device);
> + }
>
> /*
> * Do the ugly legacy mode stuff here rather than broken chip
>
Amazon Development Center Germany GmbH
Berlin - Dresden - Aachen
main office: Krausenstr. 38, 10117 Berlin
Geschaeftsfuehrer: Dr. Ralf Herbrich, Christian Schlaeger
Ust-ID: DE289237879
Eingetragen am Amtsgericht Charlottenburg HRB 149173 B