A warning is generated when a PCIe device is probed with a degraded
link, but there was no similar mechanism to warn when the link becomes
degraded after probing. The Link Bandwidth Notification provides this
mechanism.
Use the link bandwidth notification interrupt to detect bandwidth
changes, and rescan the bandwidth, looking for the weakest point. This
is the same logic used in probe().
Signed-off-by: Alexandru Gagniuc <[email protected]>
---
drivers/pci/hotplug/pciehp_hpc.c | 35 +++++++++++++++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 7dd443aea5a5..834672000b59 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -515,7 +515,8 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
struct controller *ctrl = (struct controller *)dev_id;
struct pci_dev *pdev = ctrl_dev(ctrl);
struct device *parent = pdev->dev.parent;
- u16 status, events;
+ struct pci_dev *endpoint;
+ u16 status, events, link_status;
/*
* Interrupts only occur in D3hot or shallower and only if enabled
@@ -525,6 +526,17 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
(!(ctrl->slot_ctrl & PCI_EXP_SLTCTL_HPIE) && !pciehp_poll_mode))
return IRQ_NONE;
+ pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &link_status);
+
+ if (link_status & PCI_EXP_LNKSTA_LBMS) {
+ if (pdev->subordinate && pdev->subordinate->self)
+ endpoint = pdev->subordinate->self;
+ else
+ endpoint = pdev;
+ __pcie_print_link_status(endpoint, false);
+ pcie_capability_write_word(pdev, PCI_EXP_LNKSTA, link_status);
+ }
+
/*
* Keep the port accessible by holding a runtime PM ref on its parent.
* Defer resume of the parent to the IRQ thread if it's suspended.
@@ -677,6 +689,24 @@ static int pciehp_poll(void *data)
return 0;
}
+static bool pcie_link_bandwidth_notification_supported(struct controller *ctrl)
+{
+ int ret;
+ u32 cap;
+
+ ret = pcie_capability_read_dword(ctrl_dev(ctrl), PCI_EXP_LNKCAP, &cap);
+ return (ret == PCIBIOS_SUCCESSFUL) && (cap & PCI_EXP_LNKCAP_LBNC);
+}
+
+static void pcie_enable_link_bandwidth_notification(struct controller *ctrl)
+{
+ u16 lnk_ctl;
+
+ pcie_capability_read_word(ctrl_dev(ctrl), PCI_EXP_LNKCTL, &lnk_ctl);
+ lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
+ pcie_capability_write_word(ctrl_dev(ctrl), PCI_EXP_LNKCTL, lnk_ctl);
+}
+
static void pcie_enable_notification(struct controller *ctrl)
{
u16 cmd, mask;
@@ -713,6 +743,9 @@ static void pcie_enable_notification(struct controller *ctrl)
pcie_write_cmd_nowait(ctrl, cmd, mask);
ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__,
pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, cmd);
+
+ if (pcie_link_bandwidth_notification_supported(ctrl))
+ pcie_enable_link_bandwidth_notification(ctrl);
}
static void pcie_disable_notification(struct controller *ctrl)
--
2.17.1
Hi Alexandru,
On Wed, Nov 28, 2018 at 06:08:24PM -0600, Alexandru Gagniuc wrote:
> A warning is generated when a PCIe device is probed with a degraded
> link, but there was no similar mechanism to warn when the link becomes
> degraded after probing. The Link Bandwidth Notification provides this
> mechanism.
>
> Use the link bandwidth notification interrupt to detect bandwidth
> changes, and rescan the bandwidth, looking for the weakest point. This
> is the same logic used in probe().
>
> Signed-off-by: Alexandru Gagniuc <[email protected]>
> ---
> drivers/pci/hotplug/pciehp_hpc.c | 35 +++++++++++++++++++++++++++++++-
> 1 file changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
> index 7dd443aea5a5..834672000b59 100644
> --- a/drivers/pci/hotplug/pciehp_hpc.c
> +++ b/drivers/pci/hotplug/pciehp_hpc.c
> @@ -515,7 +515,8 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
> struct controller *ctrl = (struct controller *)dev_id;
> struct pci_dev *pdev = ctrl_dev(ctrl);
> struct device *parent = pdev->dev.parent;
> - u16 status, events;
> + struct pci_dev *endpoint;
> + u16 status, events, link_status;
Looks better if you write them in opposite order (reverse xmas-tree):
u16 status, events, link_status;
struct pci_dev *endpoint;
> /*
> * Interrupts only occur in D3hot or shallower and only if enabled
> @@ -525,6 +526,17 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
> (!(ctrl->slot_ctrl & PCI_EXP_SLTCTL_HPIE) && !pciehp_poll_mode))
> return IRQ_NONE;
>
> + pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &link_status);
> +
Unnecessary empty line.
> + if (link_status & PCI_EXP_LNKSTA_LBMS) {
> + if (pdev->subordinate && pdev->subordinate->self)
> + endpoint = pdev->subordinate->self;
Hmm, I thought pdev->subordinate->self == pdev, no?
> + else
> + endpoint = pdev;
> + __pcie_print_link_status(endpoint, false);
> + pcie_capability_write_word(pdev, PCI_EXP_LNKSTA, link_status);
> + }
> +
> /*
> * Keep the port accessible by holding a runtime PM ref on its parent.
> * Defer resume of the parent to the IRQ thread if it's suspended.
> @@ -677,6 +689,24 @@ static int pciehp_poll(void *data)
> return 0;
> }
>
> +static bool pcie_link_bandwidth_notification_supported(struct controller *ctrl)
> +{
> + int ret;
> + u32 cap;
> +
> + ret = pcie_capability_read_dword(ctrl_dev(ctrl), PCI_EXP_LNKCAP, &cap);
> + return (ret == PCIBIOS_SUCCESSFUL) && (cap & PCI_EXP_LNKCAP_LBNC);
> +}
> +
> +static void pcie_enable_link_bandwidth_notification(struct controller *ctrl)
> +{
> + u16 lnk_ctl;
> +
> + pcie_capability_read_word(ctrl_dev(ctrl), PCI_EXP_LNKCTL, &lnk_ctl);
> + lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
> + pcie_capability_write_word(ctrl_dev(ctrl), PCI_EXP_LNKCTL, lnk_ctl);
> +}
> +
> static void pcie_enable_notification(struct controller *ctrl)
> {
> u16 cmd, mask;
> @@ -713,6 +743,9 @@ static void pcie_enable_notification(struct controller *ctrl)
> pcie_write_cmd_nowait(ctrl, cmd, mask);
> ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__,
> pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, cmd);
> +
> + if (pcie_link_bandwidth_notification_supported(ctrl))
> + pcie_enable_link_bandwidth_notification(ctrl);
Do we ever need to disable it?
> }
>
> static void pcie_disable_notification(struct controller *ctrl)
> --
> 2.17.1
On Wed, Nov 28, 2018 at 06:08:24PM -0600, Alexandru Gagniuc wrote:
> A warning is generated when a PCIe device is probed with a degraded
> link, but there was no similar mechanism to warn when the link becomes
> degraded after probing. The Link Bandwidth Notification provides this
> mechanism.
>
> Use the link bandwidth notification interrupt to detect bandwidth
> changes, and rescan the bandwidth, looking for the weakest point. This
> is the same logic used in probe().
I like the concept of this. What I don't like is the fact that it's
tied to pciehp, since I don't think the concept of Link Bandwidth
Notification is related to hotplug. So I think we'll only notice this
for ports that support hotplug. Maybe it's worth doing it this way
anyway, even if it could be generalized in the future?
> Signed-off-by: Alexandru Gagniuc <[email protected]>
> ---
> drivers/pci/hotplug/pciehp_hpc.c | 35 +++++++++++++++++++++++++++++++-
> 1 file changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
> index 7dd443aea5a5..834672000b59 100644
> --- a/drivers/pci/hotplug/pciehp_hpc.c
> +++ b/drivers/pci/hotplug/pciehp_hpc.c
> @@ -515,7 +515,8 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
> struct controller *ctrl = (struct controller *)dev_id;
> struct pci_dev *pdev = ctrl_dev(ctrl);
> struct device *parent = pdev->dev.parent;
> - u16 status, events;
> + struct pci_dev *endpoint;
> + u16 status, events, link_status;
>
> /*
> * Interrupts only occur in D3hot or shallower and only if enabled
> @@ -525,6 +526,17 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
> (!(ctrl->slot_ctrl & PCI_EXP_SLTCTL_HPIE) && !pciehp_poll_mode))
> return IRQ_NONE;
>
> + pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &link_status);
> +
> + if (link_status & PCI_EXP_LNKSTA_LBMS) {
> + if (pdev->subordinate && pdev->subordinate->self)
> + endpoint = pdev->subordinate->self;
> + else
> + endpoint = pdev;
> + __pcie_print_link_status(endpoint, false);
> + pcie_capability_write_word(pdev, PCI_EXP_LNKSTA, link_status);
> + }
> +
> /*
> * Keep the port accessible by holding a runtime PM ref on its parent.
> * Defer resume of the parent to the IRQ thread if it's suspended.
> @@ -677,6 +689,24 @@ static int pciehp_poll(void *data)
> return 0;
> }
>
> +static bool pcie_link_bandwidth_notification_supported(struct controller *ctrl)
> +{
> + int ret;
> + u32 cap;
> +
> + ret = pcie_capability_read_dword(ctrl_dev(ctrl), PCI_EXP_LNKCAP, &cap);
> + return (ret == PCIBIOS_SUCCESSFUL) && (cap & PCI_EXP_LNKCAP_LBNC);
> +}
> +
> +static void pcie_enable_link_bandwidth_notification(struct controller *ctrl)
> +{
> + u16 lnk_ctl;
> +
> + pcie_capability_read_word(ctrl_dev(ctrl), PCI_EXP_LNKCTL, &lnk_ctl);
> + lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
> + pcie_capability_write_word(ctrl_dev(ctrl), PCI_EXP_LNKCTL, lnk_ctl);
> +}
> +
> static void pcie_enable_notification(struct controller *ctrl)
> {
> u16 cmd, mask;
> @@ -713,6 +743,9 @@ static void pcie_enable_notification(struct controller *ctrl)
> pcie_write_cmd_nowait(ctrl, cmd, mask);
> ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__,
> pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, cmd);
> +
> + if (pcie_link_bandwidth_notification_supported(ctrl))
> + pcie_enable_link_bandwidth_notification(ctrl);
> }
>
> static void pcie_disable_notification(struct controller *ctrl)
> --
> 2.17.1
>
On 11/29/2018 11:36 AM, Bjorn Helgaas wrote:
> On Wed, Nov 28, 2018 at 06:08:24PM -0600, Alexandru Gagniuc wrote:
>> A warning is generated when a PCIe device is probed with a degraded
>> link, but there was no similar mechanism to warn when the link becomes
>> degraded after probing. The Link Bandwidth Notification provides this
>> mechanism.
>>
>> Use the link bandwidth notification interrupt to detect bandwidth
>> changes, and rescan the bandwidth, looking for the weakest point. This
>> is the same logic used in probe().
>
> I like the concept of this. What I don't like is the fact that it's
> tied to pciehp, since I don't think the concept of Link Bandwidth
> Notification is related to hotplug. So I think we'll only notice this
> for ports that support hotplug. Maybe it's worth doing it this way
> anyway, even if it could be generalized in the future?
That makes sense. At first, I thought that BW notification was tied to
hotplug, but our PCIe spec writer disagreed with that assertion. I'm
just not sure where to handle the interrupt otherwise.
Alex
On 11/29/2018 10:06 AM, Mika Westerberg wrote:
>> @@ -515,7 +515,8 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id)
>> struct controller *ctrl = (struct controller *)dev_id;
>> struct pci_dev *pdev = ctrl_dev(ctrl);
>> struct device *parent = pdev->dev.parent;
>> - u16 status, events;
>> + struct pci_dev *endpoint;
>> + u16 status, events, link_status;
>
> Looks better if you write them in opposite order (reverse xmas-tree):
>
> u16 status, events, link_status;
> struct pci_dev *endpoint;
>
I don't decorate in November :p
>> + pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &link_status);
>> +
>
> Unnecessary empty line.
However Bjorn wants it, though I don't like the crowded look with this
line removed.
>> + if (link_status & PCI_EXP_LNKSTA_LBMS) {
>> + if (pdev->subordinate && pdev->subordinate->self)
>> + endpoint = pdev->subordinate->self;
>
> Hmm, I thought pdev->subordinate->self == pdev, no?
That makes no sense, but I think you're right. I'm trying to get to the
other end of the PCIe link. Is there a simple way to do that? (other
than convoluted logic that all leads to the same mistake)
>> static void pcie_enable_notification(struct controller *ctrl)
>> {
>> u16 cmd, mask;
>> @@ -713,6 +743,9 @@ static void pcie_enable_notification(struct controller *ctrl)
>> pcie_write_cmd_nowait(ctrl, cmd, mask);
>> ctrl_dbg(ctrl, "%s: SLOTCTRL %x write cmd %x\n", __func__,
>> pci_pcie_cap(ctrl->pcie->port) + PCI_EXP_SLTCTL, cmd);
>> +
>> + if (pcie_link_bandwidth_notification_supported(ctrl))
>> + pcie_enable_link_bandwidth_notification(ctrl);
>
> Do we ever need to disable it?
I can't think of a case where that would be needed.
Alex
On Thu, Nov 29, 2018 at 06:57:37PM +0000, [email protected] wrote:
> On 11/29/2018 11:36 AM, Bjorn Helgaas wrote:
> > On Wed, Nov 28, 2018 at 06:08:24PM -0600, Alexandru Gagniuc wrote:
> >> A warning is generated when a PCIe device is probed with a degraded
> >> link, but there was no similar mechanism to warn when the link becomes
> >> degraded after probing. The Link Bandwidth Notification provides this
> >> mechanism.
> >>
> >> Use the link bandwidth notification interrupt to detect bandwidth
> >> changes, and rescan the bandwidth, looking for the weakest point. This
> >> is the same logic used in probe().
> >
> > I like the concept of this. What I don't like is the fact that it's
> > tied to pciehp, since I don't think the concept of Link Bandwidth
> > Notification is related to hotplug. So I think we'll only notice this
> > for ports that support hotplug. Maybe it's worth doing it this way
> > anyway, even if it could be generalized in the future?
>
> That makes sense. At first, I thought that BW notification was tied to
> hotplug, but our PCIe spec writer disagreed with that assertion. I'm
> just not sure where to handle the interrupt otherwise.
I guess the interrupt is shared with hotplug and PME? In that case write
a separate pcie_port_service_driver and request the interrupt with
IRQF_SHARED. Define a new service type in drivers/pci/pcie/portdrv.h.
Amend get_port_device_capability() to check for PCI_EXP_LNKCAP_LBNC.
Thanks,
Lukas
On Thu, Nov 29, 2018 at 07:00:58PM +0000, [email protected] wrote:
> >> + if (link_status & PCI_EXP_LNKSTA_LBMS) {
> >> + if (pdev->subordinate && pdev->subordinate->self)
> >> + endpoint = pdev->subordinate->self;
> >
> > Hmm, I thought pdev->subordinate->self == pdev, no?
>
> That makes no sense, but I think you're right. I'm trying to get to the
> other end of the PCIe link. Is there a simple way to do that? (other
> than convoluted logic that all leads to the same mistake)
AFAIK you should be able to find the other end by looking at the
pdev->subordinate->devices list. Not sure if there is a simpler way,
though.
On Thu, Nov 29, 2018 at 08:13:12PM +0100, Lukas Wunner wrote:
> On Thu, Nov 29, 2018 at 06:57:37PM +0000, [email protected] wrote:
> > On 11/29/2018 11:36 AM, Bjorn Helgaas wrote:
> > > On Wed, Nov 28, 2018 at 06:08:24PM -0600, Alexandru Gagniuc wrote:
> > >> A warning is generated when a PCIe device is probed with a degraded
> > >> link, but there was no similar mechanism to warn when the link becomes
> > >> degraded after probing. The Link Bandwidth Notification provides this
> > >> mechanism.
> > >>
> > >> Use the link bandwidth notification interrupt to detect bandwidth
> > >> changes, and rescan the bandwidth, looking for the weakest point. This
> > >> is the same logic used in probe().
> > >
> > > I like the concept of this. What I don't like is the fact that it's
> > > tied to pciehp, since I don't think the concept of Link Bandwidth
> > > Notification is related to hotplug. So I think we'll only notice this
> > > for ports that support hotplug. Maybe it's worth doing it this way
> > > anyway, even if it could be generalized in the future?
> >
> > That makes sense. At first, I thought that BW notification was tied to
> > hotplug, but our PCIe spec writer disagreed with that assertion. I'm
> > just not sure where to handle the interrupt otherwise.
>
> I guess the interrupt is shared with hotplug and PME? In that case write
> a separate pcie_port_service_driver and request the interrupt with
> IRQF_SHARED. Define a new service type in drivers/pci/pcie/portdrv.h.
> Amend get_port_device_capability() to check for PCI_EXP_LNKCAP_LBNC.
I really don't like the port driver design. I'd rather integrate
those services more tightly into the PCI core. But realistically
that's wishful thinking and may never happen, so this might be the
most expedient approach.
Bjorn
On 11/29/2018 5:05 PM, Bjorn Helgaas wrote:
> On Thu, Nov 29, 2018 at 08:13:12PM +0100, Lukas Wunner wrote:
>> I guess the interrupt is shared with hotplug and PME? In that case write
>> a separate pcie_port_service_driver and request the interrupt with
>> IRQF_SHARED. Define a new service type in drivers/pci/pcie/portdrv.h.
>> Amend get_port_device_capability() to check for PCI_EXP_LNKCAP_LBNC.
>
> I really don't like the port driver design. I'd rather integrate
> those services more tightly into the PCI core. But realistically
> that's wishful thinking and may never happen, so this might be the
> most expedient approach.
So, how would it get integrated? I don't like the port service driver
either. It's too dicky on how it creates some new devices that other
drives bind to. If we could have a 1:1 mapping between service drivers
and PCI capabilities, then it might make better sense.
So, do I go the new service driver route?
Alex
A warning is generated when a PCIe device is probed with a degraded
link, but there was no similar mechanism to warn when the link becomes
degraded after probing. The Link Bandwidth Notification provides this
mechanism.
Use the link bandwidth notification interrupt to detect bandwidth
changes, and rescan the bandwidth, looking for the weakest point. This
is the same logic used in probe().
Signed-off-by: Alexandru Gagniuc <[email protected]>
---
Changes since v1:
* Layer on top of the pcie port service drivers, instead of hotplug service.
This patch needs to be applied on top of:
PCI: Add missing include to drivers/pci.h
Anticipated FAQ:
Q: Why is this unconditionally compiled in?
A: The symmetrical check in pci probe() is also always compiled in.
Q: Why call module_init() instead of adding a call in pcie_init_services() ?
A: A call in pcie_init_services() also requires a prototype in portdrv.h, a
non-static implementation in bw_notification.c. Using module_init() is
functionally equivalent, and takes less code.
Q: Why print only on degraded links and not when a link is back to full speed?
For symmetry with PCI probe(). Although I see a benefit in conveying that a
link is back to full speed, I expect this to be extremely rare. Secondary bus
reset is usually needed to retrain at full bandwidth.
drivers/pci/pcie/Makefile | 1 +
drivers/pci/pcie/bw_notification.c | 107 +++++++++++++++++++++++++++++
drivers/pci/pcie/portdrv.h | 4 +-
drivers/pci/pcie/portdrv_core.c | 14 ++--
4 files changed, 121 insertions(+), 5 deletions(-)
create mode 100644 drivers/pci/pcie/bw_notification.c
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index ab514083d5d4..f1d7bc1e5efa 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -3,6 +3,7 @@
# Makefile for PCI Express features and port driver
pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o
+pcieportdrv-y += bw_notification.o
obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o
diff --git a/drivers/pci/pcie/bw_notification.c b/drivers/pci/pcie/bw_notification.c
new file mode 100644
index 000000000000..64391ea9a172
--- /dev/null
+++ b/drivers/pci/pcie/bw_notification.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PCI Express Bandwidth notification services driver
+ * Author: Alexandru Gagniuc <[email protected]>
+ *
+ * Copyright (C) 2018, Dell Inc
+ *
+ * The PCIe bandwidth notification provides a way to notify the operating system
+ * when the link width or data rate changes. This capability is required for all
+ * root ports and downstream ports supporting links wider than x1 and/or
+ * multiple link speeds.
+ *
+ * This service port driver hooks into the bandwidth notification interrupt and
+ * warns when links become degraded in operation.
+ */
+
+#include <linux/module.h>
+
+#include "../pci.h"
+#include "portdrv.h"
+
+static bool pcie_link_bandwidth_notification_supported(struct pci_dev *dev)
+{
+ int ret;
+ u32 lnk_cap;
+
+ ret = pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnk_cap);
+ return (ret == PCIBIOS_SUCCESSFUL) && (lnk_cap & PCI_EXP_LNKCAP_LBNC);
+}
+
+static void pcie_enable_link_bandwidth_notification(struct pci_dev *dev)
+{
+ u16 lnk_ctl;
+
+ pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
+ lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
+ pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
+}
+
+static void pcie_disable_link_bandwidth_notification(struct pci_dev *dev)
+{
+ u16 lnk_ctl;
+
+ pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
+ lnk_ctl &= ~PCI_EXP_LNKCTL_LBMIE;
+ pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
+}
+
+static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
+{
+ struct pcie_device *srv = context;
+ struct pci_dev *port = srv->port;
+ struct pci_dev *dev;
+ u16 link_status, events;
+ int ret;
+
+ ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+ events = link_status & PCI_EXP_LNKSTA_LBMS;
+
+ if (!events || ret != PCIBIOS_SUCCESSFUL)
+ return IRQ_NONE;
+
+ /* Print status from upstream link partner, not this downstream port. */
+ list_for_each_entry(dev, &port->subordinate->devices, bus_list)
+ __pcie_print_link_status(dev, false);
+
+ pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
+ return IRQ_HANDLED;
+}
+
+static int pcie_bandwidth_notification_probe(struct pcie_device *srv)
+{
+ int ret;
+
+ /* Single-width or single-speed ports do not have to support this. */
+ if (!pcie_link_bandwidth_notification_supported(srv->port))
+ return -ENODEV;
+
+ ret = devm_request_irq(&srv->device, srv->irq, pcie_bw_notification_irq,
+ IRQF_SHARED, "PCIe BW notif", srv);
+ if (ret)
+ return ret;
+
+ pcie_enable_link_bandwidth_notification(srv->port);
+
+ return 0;
+}
+
+static void pcie_bandwidth_notification_remove(struct pcie_device *srv)
+{
+ pcie_disable_link_bandwidth_notification(srv->port);
+}
+
+static struct pcie_port_service_driver pcie_bandwidth_notification_driver = {
+ .name = "pcie_bw_notification",
+ .port_type = PCI_EXP_TYPE_DOWNSTREAM,
+ .service = PCIE_PORT_SERVICE_BwNOTIF,
+ .probe = pcie_bandwidth_notification_probe,
+ .remove = pcie_bandwidth_notification_remove,
+};
+
+static int __init pcie_bandwidth_notification_service_init(void)
+{
+ return pcie_port_service_register(&pcie_bandwidth_notification_driver);
+}
+
+module_init(pcie_bandwidth_notification_service_init);
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index e495f04394d0..46652469ffaa 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -20,8 +20,10 @@
#define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT)
#define PCIE_PORT_SERVICE_DPC_SHIFT 3 /* Downstream Port Containment */
#define PCIE_PORT_SERVICE_DPC (1 << PCIE_PORT_SERVICE_DPC_SHIFT)
+#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT 4 /* Bandwidth notification */
+#define PCIE_PORT_SERVICE_BwNOTIF (1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
-#define PCIE_PORT_DEVICE_MAXSERVICES 4
+#define PCIE_PORT_DEVICE_MAXSERVICES 5
#ifdef CONFIG_PCIEAER
int pcie_aer_init(void);
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index f458ac9cb70c..86455ff7ced9 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -99,7 +99,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
*/
static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
{
- int nr_entries, nvec;
+ int nr_entries, nvec, pcie_irq;
u32 pme = 0, aer = 0, dpc = 0;
/* Allocate the maximum possible number of MSI/MSI-X vectors */
@@ -136,9 +136,12 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
}
/* PME and hotplug share an MSI/MSI-X vector */
- if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
- irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pci_irq_vector(dev, pme);
- irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pci_irq_vector(dev, pme);
+ if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
+ PCIE_PORT_SERVICE_BwNOTIF)) {
+ pcie_irq = pci_irq_vector(dev, pme);
+ irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
+ irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
+ irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
}
if (mask & PCIE_PORT_SERVICE_AER)
@@ -250,6 +253,9 @@ static int get_port_device_capability(struct pci_dev *dev)
pci_aer_available() && services & PCIE_PORT_SERVICE_AER)
services |= PCIE_PORT_SERVICE_DPC;
+ if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)
+ services |= PCIE_PORT_SERVICE_BwNOTIF;
+
return services;
}
--
2.17.1
On 12/7/18 12:20 PM, Alexandru Gagniuc wrote:
> A warning is generated when a PCIe device is probed with a degraded
> link, but there was no similar mechanism to warn when the link becomes
> degraded after probing. The Link Bandwidth Notification provides this
> mechanism.
>
> Use the link bandwidth notification interrupt to detect bandwidth
> changes, and rescan the bandwidth, looking for the weakest point. This
> is the same logic used in probe().
>
> Signed-off-by: Alexandru Gagniuc <[email protected]>
> ---
ping.
> Changes since v1:
> * Layer on top of the pcie port service drivers, instead of hotplug service.
>
> This patch needs to be applied on top of:
> PCI: Add missing include to drivers/pci.h
>
> Anticipated FAQ:
>
> Q: Why is this unconditionally compiled in?
> A: The symmetrical check in pci probe() is also always compiled in.
>
> Q: Why call module_init() instead of adding a call in pcie_init_services() ?
> A: A call in pcie_init_services() also requires a prototype in portdrv.h, a
> non-static implementation in bw_notification.c. Using module_init() is
> functionally equivalent, and takes less code.
>
> Q: Why print only on degraded links and not when a link is back to full speed?
> For symmetry with PCI probe(). Although I see a benefit in conveying that a
> link is back to full speed, I expect this to be extremely rare. Secondary bus
> reset is usually needed to retrain at full bandwidth.
>
>
> drivers/pci/pcie/Makefile | 1 +
> drivers/pci/pcie/bw_notification.c | 107 +++++++++++++++++++++++++++++
> drivers/pci/pcie/portdrv.h | 4 +-
> drivers/pci/pcie/portdrv_core.c | 14 ++--
> 4 files changed, 121 insertions(+), 5 deletions(-)
> create mode 100644 drivers/pci/pcie/bw_notification.c
>
> diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
> index ab514083d5d4..f1d7bc1e5efa 100644
> --- a/drivers/pci/pcie/Makefile
> +++ b/drivers/pci/pcie/Makefile
> @@ -3,6 +3,7 @@
> # Makefile for PCI Express features and port driver
>
> pcieportdrv-y := portdrv_core.o portdrv_pci.o err.o
> +pcieportdrv-y += bw_notification.o
>
> obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o
>
> diff --git a/drivers/pci/pcie/bw_notification.c b/drivers/pci/pcie/bw_notification.c
> new file mode 100644
> index 000000000000..64391ea9a172
> --- /dev/null
> +++ b/drivers/pci/pcie/bw_notification.c
> @@ -0,0 +1,107 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * PCI Express Bandwidth notification services driver
> + * Author: Alexandru Gagniuc <[email protected]>
> + *
> + * Copyright (C) 2018, Dell Inc
> + *
> + * The PCIe bandwidth notification provides a way to notify the operating system
> + * when the link width or data rate changes. This capability is required for all
> + * root ports and downstream ports supporting links wider than x1 and/or
> + * multiple link speeds.
> + *
> + * This service port driver hooks into the bandwidth notification interrupt and
> + * warns when links become degraded in operation.
> + */
> +
> +#include <linux/module.h>
> +
> +#include "../pci.h"
> +#include "portdrv.h"
> +
> +static bool pcie_link_bandwidth_notification_supported(struct pci_dev *dev)
> +{
> + int ret;
> + u32 lnk_cap;
> +
> + ret = pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnk_cap);
> + return (ret == PCIBIOS_SUCCESSFUL) && (lnk_cap & PCI_EXP_LNKCAP_LBNC);
> +}
> +
> +static void pcie_enable_link_bandwidth_notification(struct pci_dev *dev)
> +{
> + u16 lnk_ctl;
> +
> + pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
> + lnk_ctl |= PCI_EXP_LNKCTL_LBMIE;
> + pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
> +}
> +
> +static void pcie_disable_link_bandwidth_notification(struct pci_dev *dev)
> +{
> + u16 lnk_ctl;
> +
> + pcie_capability_read_word(dev, PCI_EXP_LNKCTL, &lnk_ctl);
> + lnk_ctl &= ~PCI_EXP_LNKCTL_LBMIE;
> + pcie_capability_write_word(dev, PCI_EXP_LNKCTL, lnk_ctl);
> +}
> +
> +static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
> +{
> + struct pcie_device *srv = context;
> + struct pci_dev *port = srv->port;
> + struct pci_dev *dev;
> + u16 link_status, events;
> + int ret;
> +
> + ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
> + events = link_status & PCI_EXP_LNKSTA_LBMS;
> +
> + if (!events || ret != PCIBIOS_SUCCESSFUL)
> + return IRQ_NONE;
> +
> + /* Print status from upstream link partner, not this downstream port. */
> + list_for_each_entry(dev, &port->subordinate->devices, bus_list)
> + __pcie_print_link_status(dev, false);
> +
> + pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
> + return IRQ_HANDLED;
> +}
> +
> +static int pcie_bandwidth_notification_probe(struct pcie_device *srv)
> +{
> + int ret;
> +
> + /* Single-width or single-speed ports do not have to support this. */
> + if (!pcie_link_bandwidth_notification_supported(srv->port))
> + return -ENODEV;
> +
> + ret = devm_request_irq(&srv->device, srv->irq, pcie_bw_notification_irq,
> + IRQF_SHARED, "PCIe BW notif", srv);
> + if (ret)
> + return ret;
> +
> + pcie_enable_link_bandwidth_notification(srv->port);
> +
> + return 0;
> +}
> +
> +static void pcie_bandwidth_notification_remove(struct pcie_device *srv)
> +{
> + pcie_disable_link_bandwidth_notification(srv->port);
> +}
> +
> +static struct pcie_port_service_driver pcie_bandwidth_notification_driver = {
> + .name = "pcie_bw_notification",
> + .port_type = PCI_EXP_TYPE_DOWNSTREAM,
> + .service = PCIE_PORT_SERVICE_BwNOTIF,
> + .probe = pcie_bandwidth_notification_probe,
> + .remove = pcie_bandwidth_notification_remove,
> +};
> +
> +static int __init pcie_bandwidth_notification_service_init(void)
> +{
> + return pcie_port_service_register(&pcie_bandwidth_notification_driver);
> +}
> +
> +module_init(pcie_bandwidth_notification_service_init);
> diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
> index e495f04394d0..46652469ffaa 100644
> --- a/drivers/pci/pcie/portdrv.h
> +++ b/drivers/pci/pcie/portdrv.h
> @@ -20,8 +20,10 @@
> #define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT)
> #define PCIE_PORT_SERVICE_DPC_SHIFT 3 /* Downstream Port Containment */
> #define PCIE_PORT_SERVICE_DPC (1 << PCIE_PORT_SERVICE_DPC_SHIFT)
> +#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT 4 /* Bandwidth notification */
> +#define PCIE_PORT_SERVICE_BwNOTIF (1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
>
> -#define PCIE_PORT_DEVICE_MAXSERVICES 4
> +#define PCIE_PORT_DEVICE_MAXSERVICES 5
>
> #ifdef CONFIG_PCIEAER
> int pcie_aer_init(void);
> diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
> index f458ac9cb70c..86455ff7ced9 100644
> --- a/drivers/pci/pcie/portdrv_core.c
> +++ b/drivers/pci/pcie/portdrv_core.c
> @@ -99,7 +99,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
> */
> static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
> {
> - int nr_entries, nvec;
> + int nr_entries, nvec, pcie_irq;
> u32 pme = 0, aer = 0, dpc = 0;
>
> /* Allocate the maximum possible number of MSI/MSI-X vectors */
> @@ -136,9 +136,12 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
> }
>
> /* PME and hotplug share an MSI/MSI-X vector */
> - if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
> - irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pci_irq_vector(dev, pme);
> - irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pci_irq_vector(dev, pme);
> + if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
> + PCIE_PORT_SERVICE_BwNOTIF)) {
> + pcie_irq = pci_irq_vector(dev, pme);
> + irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
> + irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
> + irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
> }
>
> if (mask & PCIE_PORT_SERVICE_AER)
> @@ -250,6 +253,9 @@ static int get_port_device_capability(struct pci_dev *dev)
> pci_aer_available() && services & PCIE_PORT_SERVICE_AER)
> services |= PCIE_PORT_SERVICE_DPC;
>
> + if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)
> + services |= PCIE_PORT_SERVICE_BwNOTIF;
> +
> return services;
> }
>
>
On Fri, Dec 07, 2018 at 12:20:00PM -0600, Alexandru Gagniuc wrote:
> A warning is generated when a PCIe device is probed with a degraded
> link, but there was no similar mechanism to warn when the link becomes
> degraded after probing. The Link Bandwidth Notification provides this
> mechanism.
>
> Use the link bandwidth notification interrupt to detect bandwidth
> changes, and rescan the bandwidth, looking for the weakest point. This
> is the same logic used in probe().
This is unrelated to pciehp, so the subject prefix should be changed
to "PCI/portdrv".
> Q: Why is this unconditionally compiled in?
> A: The symmetrical check in pci probe() is also always compiled in.
Hm, it looks like the convention is to provide a separate Kconfig entry
for each port service.
> Q: Why call module_init() instead of adding a call in pcie_init_services() ?
> A: A call in pcie_init_services() also requires a prototype in portdrv.h, a
> non-static implementation in bw_notification.c. Using module_init() is
> functionally equivalent, and takes less code.
Commit c29de84149ab ("PCI: portdrv: Initialize service drivers directly")
moved away from module_init() on purpose, apparently to fix a race
condition.
> Q: Why print only on degraded links and not when a link is back to full speed?
> For symmetry with PCI probe(). Although I see a benefit in conveying that a
> link is back to full speed, I expect this to be extremely rare. Secondary bus
> reset is usually needed to retrain at full bandwidth.
What if the link is retrained at the same speed/width? Intuitively
I'd compare the speed in the Link Status Register to what is cached
in the cur_bus_speed member of struct pci_bus and print a message
only if the speed has changed. (Don't we need to cache the width as
well?)
> +static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
> +{
> + struct pcie_device *srv = context;
> + struct pci_dev *port = srv->port;
> + struct pci_dev *dev;
> + u16 link_status, events;
> + int ret;
> +
> + ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
> + events = link_status & PCI_EXP_LNKSTA_LBMS;
> +
> + if (!events || ret != PCIBIOS_SUCCESSFUL)
> + return IRQ_NONE;
> +
> + /* Print status from upstream link partner, not this downstream port. */
> + list_for_each_entry(dev, &port->subordinate->devices, bus_list)
> + __pcie_print_link_status(dev, false);
> +
> + pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
> + return IRQ_HANDLED;
> +}
You need to hold pci_bus_sem for the list iteration to protect against
simultaneous removal of child devices.
This may sleep, so request the IRQ with request_threaded_irq(), pass NULL
for the "handler" argument and pcie_bw_notification_irq for the "thread_fn"
argument.
You may want to call pcie_update_link_speed() to update the now stale
speed cached in the pci_bus struct.
> + ret = devm_request_irq(&srv->device, srv->irq, pcie_bw_notification_irq,
> + IRQF_SHARED, "PCIe BW notif", srv);
The plan is to move away from port services as devices in the future,
so device-managed allocations should be avoided.
Apart from that, with a device-managed IRQ, if this service driver is
unbound, its IRQ handler may still be invoked if some other port service
signals an interrupt (because the IRQ is shared). Which seems wrong.
> + .port_type = PCI_EXP_TYPE_DOWNSTREAM,
According to PCIe r4.0, sec 7.8.6, Link Bandwidth Notification Capability
is also required for root ports, so I think you need to match for
PCIE_ANY_PORT and amend pcie_link_bandwidth_notification_supported()
to check that you're dealing with a root or downstream port.
> + .service = PCIE_PORT_SERVICE_BwNOTIF,
All caps please.
> @@ -136,9 +136,12 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
> }
>
> /* PME and hotplug share an MSI/MSI-X vector */
> - if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP)) {
> - irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pci_irq_vector(dev, pme);
> - irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pci_irq_vector(dev, pme);
> + if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
> + PCIE_PORT_SERVICE_BwNOTIF)) {
> + pcie_irq = pci_irq_vector(dev, pme);
> + irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
> + irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
> + irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
Please amend the code comment with something like
- /* PME and hotplug share an MSI/MSI-X vector */
+ /* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */
> @@ -250,6 +253,9 @@ static int get_port_device_capability(struct pci_dev *dev)
> pci_aer_available() && services & PCIE_PORT_SERVICE_AER)
> services |= PCIE_PORT_SERVICE_DPC;
>
> + if (pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)
> + services |= PCIE_PORT_SERVICE_BwNOTIF;
> +
Also need to check for root ports, see above.
Otherwise LGTM.
Thanks,
Lukas
On 2/24/19 8:29 PM, Lukas Wunner wrote:
> On Fri, Dec 07, 2018 at 12:20:00PM -0600, Alexandru Gagniuc wrote:
>
>
>> Q: Why is this unconditionally compiled in?
>> A: The symmetrical check in pci probe() is also always compiled in.
>
> Hm, it looks like the convention is to provide a separate Kconfig entry
> for each port service.
Does the convention still make sense in light of the symmetry reason?
>> Q: Why call module_init() instead of adding a call in pcie_init_services() ?
>> A: A call in pcie_init_services() also requires a prototype in portdrv.h, a
>> non-static implementation in bw_notification.c. Using module_init() is
>> functionally equivalent, and takes less code.
>
> Commit c29de84149ab ("PCI: portdrv: Initialize service drivers directly")
> moved away from module_init() on purpose, apparently to fix a race
> condition.
*GROWL*
> What if the link is retrained at the same speed/width? Intuitively
> I'd compare the speed in the Link Status Register to what is cached
> in the cur_bus_speed member of struct pci_bus and print a message
> only if the speed has changed. (Don't we need to cache the width as
> well?)
There are two mechanisms to bring a degraded link back to full BW.
1. Secondary bus reset, which results in the device being tore down
along with our cached speed value.
2. Set the PCI_EXP_LNKCTL_LD (Link Disable) bit and clear it. We do
that as part of the pciehp teardown path. We'd lose our cached value
just like in the first case.
>> +static irqreturn_t pcie_bw_notification_irq(int irq, void *context)
>> [...]
>
> You need to hold pci_bus_sem [...]
> This may sleep, so request the IRQ with request_threaded_irq() [...]
Good catch! Thanks!
All other issues you pointed out should be resolved in next version.
Alex
On Wed, Feb 27, 2019 at 08:21:58PM +0000, [email protected] wrote:
> On 2/24/19 8:29 PM, Lukas Wunner wrote:
> > On Fri, Dec 07, 2018 at 12:20:00PM -0600, Alexandru Gagniuc wrote:
> > > Q: Why is this unconditionally compiled in?
> > > A: The symmetrical check in pci probe() is also always compiled in.
> >
> > Hm, it looks like the convention is to provide a separate Kconfig entry
> > for each port service.
>
> Does the convention still make sense in light of the symmetry reason?
I don't know. In the past we had OpenWRT/LEDE folks complain that
the PCI core is hogging too much memory on their space-constrained
Mips routers, so they #ifdef'ed a lot of x86-specific stuff in
quirks.c. In the same vein they could argue that they'd like to
disable unneeded PCIe services. That space argument and possibly
the reduction of attack surface on Linux firewalls is the only
justification I can come up with for a separate Kconfig option for
bandwidth notification.
Thanks,
Lukas