This series implements shutdown & reinitialization of the PCIe RC on
system suspend. Tested on 8280-crd.
Changes in v2:
* Rebase
* Get rid of "Cache last icc bandwidth", use icc_enable instead
* Don't permanently assert reset on clk enable fail in "Reshuffle reset.."
* Drop fixes tag in "Reshuffle reset.."
* Improve commit messages of "Reshuffle reset.." and "Implement RC shutdown.."
* Only set icc tag on RPMh SoCs
* Pick up rb
Link to v1: https://lore.kernel.org/linux-arm-msm/[email protected]/
Signed-off-by: Konrad Dybcio <[email protected]>
---
Konrad Dybcio (3):
PCI: qcom: reshuffle reset logic in 2_7_0 .init
PCI: qcom: Read back PARF_LTSSM register
PCI: qcom: properly implement RC shutdown/power up
drivers/pci/controller/dwc/Kconfig | 1 +
drivers/pci/controller/dwc/pcie-qcom.c | 178 +++++++++++++++++++++++++--------
2 files changed, 135 insertions(+), 44 deletions(-)
---
base-commit: 445a555e0623387fa9b94e68e61681717e70200a
change-id: 20240210-topic-8280_pcie-c94f58158029
Best regards,
--
Konrad Dybcio <[email protected]>
To ensure write completion, read the PARF_LTSSM register after setting
the LTSSM enable bit before polling for "link up".
Reviewed-by: Manivannan Sadhasivam <[email protected]>
Signed-off-by: Konrad Dybcio <[email protected]>
---
drivers/pci/controller/dwc/pcie-qcom.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index cbde9effa352..6a469ed213ce 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -539,6 +539,7 @@ static void qcom_pcie_2_3_2_ltssm_enable(struct qcom_pcie *pcie)
val = readl(pcie->parf + PARF_LTSSM);
val |= LTSSM_EN;
writel(val, pcie->parf + PARF_LTSSM);
+ readl(pcie->parf + PARF_LTSSM);
}
static int qcom_pcie_get_resources_2_3_2(struct qcom_pcie *pcie)
--
2.40.1
Currently, we've only been minimizing the power draw while keeping the
RC up at all times. This is suboptimal, as it draws a whole lot of power
and prevents the SoC from power collapsing.
Implement full shutdown and re-initialization to allow for powering off
the controller.
This is mainly indended for SC8280XP with a broken power rail setup,
which requires a full RC shutdown/reinit in order to reach SoC-wide
power collapse, but sleeping is generally better than not sleeping and
less destructive suspend can be implemented later for platforms that
support it.
Co-developed-by: Bjorn Andersson <[email protected]>
Signed-off-by: Konrad Dybcio <[email protected]>
---
drivers/pci/controller/dwc/Kconfig | 1 +
drivers/pci/controller/dwc/pcie-qcom.c | 159 ++++++++++++++++++++++++++-------
2 files changed, 126 insertions(+), 34 deletions(-)
diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig
index 8afacc90c63b..4ce266951cb6 100644
--- a/drivers/pci/controller/dwc/Kconfig
+++ b/drivers/pci/controller/dwc/Kconfig
@@ -268,6 +268,7 @@ config PCIE_DW_PLAT_EP
config PCIE_QCOM
bool "Qualcomm PCIe controller (host mode)"
depends on OF && (ARCH_QCOM || COMPILE_TEST)
+ depends on QCOM_COMMAND_DB || QCOM_COMMAND_DB=n
depends on PCI_MSI
select PCIE_DW_HOST
select CRC8
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index 6a469ed213ce..c807833ee4a7 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -30,13 +30,18 @@
#include <linux/reset.h>
#include <linux/slab.h>
#include <linux/types.h>
+#include <soc/qcom/cmd-db.h>
#include "../../pci.h"
#include "pcie-designware.h"
+#include <dt-bindings/interconnect/qcom,icc.h>
+#include <dt-bindings/interconnect/qcom,rpm-icc.h>
+
/* PARF registers */
#define PARF_SYS_CTRL 0x00
#define PARF_PM_CTRL 0x20
+#define PARF_PM_STTS 0x24
#define PARF_PCS_DEEMPH 0x34
#define PARF_PCS_SWING 0x38
#define PARF_PHY_CTRL 0x40
@@ -80,7 +85,10 @@
#define L1_CLK_RMV_DIS BIT(1)
/* PARF_PM_CTRL register fields */
-#define REQ_NOT_ENTR_L1 BIT(5)
+#define REQ_NOT_ENTR_L1 BIT(5) /* "Prevent L0->L1" */
+
+/* PARF_PM_STTS register fields */
+#define PM_ENTER_L23 BIT(5)
/* PARF_PCS_DEEMPH register fields */
#define PCS_DEEMPH_TX_DEEMPH_GEN1(x) FIELD_PREP(GENMASK(21, 16), x)
@@ -122,6 +130,7 @@
/* ELBI_SYS_CTRL register fields */
#define ELBI_SYS_CTRL_LT_ENABLE BIT(0)
+#define ELBI_SYS_CTRL_PME_TURNOFF_MSG BIT(4)
/* AXI_MSTR_RESP_COMP_CTRL0 register fields */
#define CFG_REMOTE_RD_REQ_BRIDGE_SIZE_2K 0x4
@@ -243,6 +252,7 @@ struct qcom_pcie {
const struct qcom_pcie_cfg *cfg;
struct dentry *debugfs;
bool suspended;
+ bool soc_is_rpmh;
};
#define to_qcom_pcie(x) dev_get_drvdata((x)->dev)
@@ -272,6 +282,24 @@ static int qcom_pcie_start_link(struct dw_pcie *pci)
return 0;
}
+static int qcom_pcie_stop_link(struct dw_pcie *pci)
+{
+ struct qcom_pcie *pcie = to_qcom_pcie(pci);
+ u32 ret_l23, val;
+
+ writel(ELBI_SYS_CTRL_PME_TURNOFF_MSG, pcie->elbi + ELBI_SYS_CTRL);
+ readl(pcie->elbi + ELBI_SYS_CTRL);
+
+ ret_l23 = readl_poll_timeout(pcie->parf + PARF_PM_STTS, val,
+ val & PM_ENTER_L23, 10000, 100000);
+ if (ret_l23) {
+ dev_err(pci->dev, "Failed to enter L2/L3\n");
+ return -ETIMEDOUT;
+ }
+
+ return 0;
+}
+
static void qcom_pcie_clear_hpc(struct dw_pcie *pci)
{
u16 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
@@ -987,9 +1015,19 @@ static void qcom_pcie_host_post_init_2_7_0(struct qcom_pcie *pcie)
static void qcom_pcie_deinit_2_7_0(struct qcom_pcie *pcie)
{
struct qcom_pcie_resources_2_7_0 *res = &pcie->res.v2_7_0;
+ u32 val;
+
+ /* Disable PCIe clocks and resets */
+ val = readl(pcie->parf + PARF_PHY_CTRL);
+ val |= PHY_TEST_PWR_DOWN;
+ writel(val, pcie->parf + PARF_PHY_CTRL);
+ readl(pcie->parf + PARF_PHY_CTRL);
clk_bulk_disable_unprepare(res->num_clks, res->clks);
+ reset_control_assert(res->rst);
+ usleep_range(2000, 2500);
+
regulator_bulk_disable(ARRAY_SIZE(res->supplies), res->supplies);
}
@@ -1545,6 +1583,9 @@ static int qcom_pcie_probe(struct platform_device *pdev)
goto err_phy_exit;
}
+ /* If the soc features RPMh, cmd_db must have been prepared by now */
+ pcie->soc_is_rpmh = !cmd_db_ready();
+
qcom_pcie_icc_update(pcie);
if (pcie->mhi)
@@ -1561,58 +1602,108 @@ static int qcom_pcie_probe(struct platform_device *pdev)
return ret;
}
-static int qcom_pcie_suspend_noirq(struct device *dev)
+static int qcom_pcie_resume_noirq(struct device *dev)
{
struct qcom_pcie *pcie = dev_get_drvdata(dev);
int ret;
- /*
- * Set minimum bandwidth required to keep data path functional during
- * suspend.
- */
- ret = icc_set_bw(pcie->icc_mem, 0, kBps_to_icc(1));
- if (ret) {
- dev_err(dev, "Failed to set interconnect bandwidth: %d\n", ret);
- return ret;
+ if (pcie->soc_is_rpmh) {
+ /*
+ * Undo the tag change from qcom_pcie_suspend_noirq first in
+ * case RPMh spontaneously decides to power collapse the
+ * platform based on other inputs.
+ */
+ icc_set_tag(pcie->icc_mem, QCOM_ICC_TAG_ALWAYS);
+
+ /* Flush the tag change */
+ ret = icc_enable(pcie->icc_mem);
+ if (ret) {
+ dev_err(pcie->pci->dev, "failed to icc_enable: %d", ret);
+ return ret;
+ }
}
- /*
- * Turn OFF the resources only for controllers without active PCIe
- * devices. For controllers with active devices, the resources are kept
- * ON and the link is expected to be in L0/L1 (sub)states.
- *
- * Turning OFF the resources for controllers with active PCIe devices
- * will trigger access violation during the end of the suspend cycle,
- * as kernel tries to access the PCIe devices config space for masking
- * MSIs.
- *
- * Also, it is not desirable to put the link into L2/L3 state as that
- * implies VDD supply will be removed and the devices may go into
- * powerdown state. This will affect the lifetime of the storage devices
- * like NVMe.
- */
- if (!dw_pcie_link_up(pcie->pci)) {
- qcom_pcie_host_deinit(&pcie->pci->pp);
- pcie->suspended = true;
- }
+ /* Only check this now to make sure the icc tag has been set. */
+ if (!pcie->suspended)
+ return 0;
+
+ ret = qcom_pcie_host_init(&pcie->pci->pp);
+ if (ret)
+ goto revert_icc_tag;
+
+ dw_pcie_setup_rc(&pcie->pci->pp);
+
+ ret = qcom_pcie_start_link(pcie->pci);
+ if (ret)
+ goto deinit_host;
+
+ /* Ignore the retval, the devices may come up later. */
+ dw_pcie_wait_for_link(pcie->pci);
+
+ qcom_pcie_icc_update(pcie);
+
+ pcie->suspended = false;
return 0;
+
+deinit_host:
+ qcom_pcie_host_deinit(&pcie->pci->pp);
+revert_icc_tag:
+ if (pcie->soc_is_rpmh) {
+ icc_set_tag(pcie->icc_mem, QCOM_ICC_TAG_WAKE);
+
+ /* Ignore the retval, failing here would be tragic anyway.. */
+ icc_enable(pcie->icc_mem);
+ }
+
+ return ret;
}
-static int qcom_pcie_resume_noirq(struct device *dev)
+static int qcom_pcie_suspend_noirq(struct device *dev)
{
struct qcom_pcie *pcie = dev_get_drvdata(dev);
int ret;
- if (pcie->suspended) {
- ret = qcom_pcie_host_init(&pcie->pci->pp);
+ if (pcie->suspended)
+ return 0;
+
+ if (dw_pcie_link_up(pcie->pci)) {
+ ret = qcom_pcie_stop_link(pcie->pci);
if (ret)
return ret;
+ }
- pcie->suspended = false;
+ qcom_pcie_host_deinit(&pcie->pci->pp);
+
+ if (pcie->soc_is_rpmh) {
+ /*
+ * The PCIe RC may be covertly accessed by the secure firmware
+ * on sleep exit. Use the WAKE bucket to let RPMh pull the plug
+ * on PCIe in sleep, but guarantee it comes back up for resume.
+ */
+ icc_set_tag(pcie->icc_mem, QCOM_ICC_TAG_WAKE);
+
+ /* Flush the tag change */
+ ret = icc_enable(pcie->icc_mem);
+ if (ret) {
+ dev_err(pcie->pci->dev, "failed to icc_enable %d\n", ret);
+
+ /* Revert everything and pray icc calls succeed */
+ return qcom_pcie_resume_noirq(dev);
+ }
+ } else {
+ /*
+ * Set minimum bandwidth required to keep data path functional
+ * during suspend.
+ */
+ ret = icc_set_bw(pcie->icc_mem, 0, kBps_to_icc(1));
+ if (ret) {
+ dev_err(dev, "Failed to set interconnect bandwidth: %d\n", ret);
+ return ret;
+ }
}
- qcom_pcie_icc_update(pcie);
+ pcie->suspended = true;
return 0;
}
--
2.40.1
At least on SC8280XP, if the PCIe reset is asserted, the corresponding
AUX_CLK will be stuck at 'off'. This has not been an issue so far,
since the reset is both left de-asserted by the previous boot stages
and the driver only toggles it briefly in .init.
As part of the upcoming suspend prodecure however, the reset will be
held asserted.
Assert the reset (which may end up being a NOP in some cases) and
de-assert it back *before* turning on the clocks in preparation for
introducing RC powerdown and reinitialization.
Signed-off-by: Konrad Dybcio <[email protected]>
---
drivers/pci/controller/dwc/pcie-qcom.c | 18 ++++++++----------
1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index 2ce2a3bd932b..cbde9effa352 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -900,27 +900,27 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie)
return ret;
}
- ret = clk_bulk_prepare_enable(res->num_clks, res->clks);
- if (ret < 0)
- goto err_disable_regulators;
-
+ /* Assert the reset to hold the RC in a known state */
ret = reset_control_assert(res->rst);
if (ret) {
dev_err(dev, "reset assert failed (%d)\n", ret);
- goto err_disable_clocks;
+ goto err_disable_regulators;
}
-
usleep_range(1000, 1500);
+ /* GCC_PCIE_n_AUX_CLK won't come up if the reset is asserted */
ret = reset_control_deassert(res->rst);
if (ret) {
dev_err(dev, "reset deassert failed (%d)\n", ret);
- goto err_disable_clocks;
+ goto err_disable_regulators;
}
-
/* Wait for reset to complete, required on SM8450 */
usleep_range(1000, 1500);
+ ret = clk_bulk_prepare_enable(res->num_clks, res->clks);
+ if (ret < 0)
+ goto err_disable_regulators;
+
/* configure PCIe to RC mode */
writel(DEVICE_TYPE_RC, pcie->parf + PARF_DEVICE_TYPE);
@@ -951,8 +951,6 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie)
writel(val, pcie->parf + PARF_AXI_MSTR_WR_ADDR_HALT_V2);
return 0;
-err_disable_clocks:
- clk_bulk_disable_unprepare(res->num_clks, res->clks);
err_disable_regulators:
regulator_bulk_disable(ARRAY_SIZE(res->supplies), res->supplies);
--
2.40.1
Would be nice to have a hint in the subject line about what this does.
Also capitalize to match the others ("PCI: qcom: <Capitalized verb>").
On Sat, Feb 10, 2024 at 06:10:05PM +0100, Konrad Dybcio wrote:
> At least on SC8280XP, if the PCIe reset is asserted, the corresponding
> AUX_CLK will be stuck at 'off'. This has not been an issue so far,
> since the reset is both left de-asserted by the previous boot stages
> and the driver only toggles it briefly in .init.
>
> As part of the upcoming suspend prodecure however, the reset will be
> held asserted.
s/prodecure/procedure/
> Assert the reset (which may end up being a NOP in some cases) and
> de-assert it back *before* turning on the clocks in preparation for
> introducing RC powerdown and reinitialization.
Bjorn
Maybe include the reason in the subject? "Read back" is literally
what the diff says.
On Sat, Feb 10, 2024 at 06:10:06PM +0100, Konrad Dybcio wrote:
> To ensure write completion, read the PARF_LTSSM register after setting
> the LTSSM enable bit before polling for "link up".
The write will obviously complete *some* time; I assume the point is
that it's important for it to complete before some other event, and it
would be nice to know why that's important.
> Reviewed-by: Manivannan Sadhasivam <[email protected]>
> Signed-off-by: Konrad Dybcio <[email protected]>
> ---
> drivers/pci/controller/dwc/pcie-qcom.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
> index cbde9effa352..6a469ed213ce 100644
> --- a/drivers/pci/controller/dwc/pcie-qcom.c
> +++ b/drivers/pci/controller/dwc/pcie-qcom.c
> @@ -539,6 +539,7 @@ static void qcom_pcie_2_3_2_ltssm_enable(struct qcom_pcie *pcie)
> val = readl(pcie->parf + PARF_LTSSM);
> val |= LTSSM_EN;
> writel(val, pcie->parf + PARF_LTSSM);
> + readl(pcie->parf + PARF_LTSSM);
> }
>
> static int qcom_pcie_get_resources_2_3_2(struct qcom_pcie *pcie)
>
> --
> 2.40.1
>
"Properly" is a noise word that suggests "we're doing it right this
time" but doesn't hint at what actually makes this better.
On Sat, Feb 10, 2024 at 06:10:07PM +0100, Konrad Dybcio wrote:
> Currently, we've only been minimizing the power draw while keeping the
> RC up at all times. This is suboptimal, as it draws a whole lot of power
> and prevents the SoC from power collapsing.
Is "power collapse" a technical term specific to this device, or is
there some more common term that could be used? I assume the fact
that the RC remains powered precludes some lower power state of the
entire SoC?
> Implement full shutdown and re-initialization to allow for powering off
> the controller.
>
> This is mainly indended for SC8280XP with a broken power rail setup,
> which requires a full RC shutdown/reinit in order to reach SoC-wide
> power collapse, but sleeping is generally better than not sleeping and
> less destructive suspend can be implemented later for platforms that
> support it.
s/indended/intended/
> config PCIE_QCOM
> bool "Qualcomm PCIe controller (host mode)"
> depends on OF && (ARCH_QCOM || COMPILE_TEST)
> + depends on QCOM_COMMAND_DB || QCOM_COMMAND_DB=n
Just out of curiosity since I'm not a Kconfig expert, what does
"depends on X || X=n" mean?
I guess it's different from
"depends on (QCOM_COMMAND_DB || !QCOM_COMMAND_DB)", which I also see
used for QCOM_RPMH?
Does this reduce compile testing? I see COMPILE_TEST mentioned in a
few other QCOM_COMMAND_DB dependencies.
> + ret_l23 = readl_poll_timeout(pcie->parf + PARF_PM_STTS, val,
> + val & PM_ENTER_L23, 10000, 100000);
Are these timeout values rooted in some PCIe or Qcom spec? Would be
nice to have a spec citation or other reason for choosing these
values.
> + reset_control_assert(res->rst);
> + usleep_range(2000, 2500);
Ditto, some kind of citation would be nice.
Bjorn
On 12.02.2024 22:32, Bjorn Helgaas wrote:
> "Properly" is a noise word that suggests "we're doing it right this
> time" but doesn't hint at what actually makes this better.
>
> On Sat, Feb 10, 2024 at 06:10:07PM +0100, Konrad Dybcio wrote:
>> Currently, we've only been minimizing the power draw while keeping the
>> RC up at all times. This is suboptimal, as it draws a whole lot of power
>> and prevents the SoC from power collapsing.
>
> Is "power collapse" a technical term specific to this device, or is
> there some more common term that could be used? I assume the fact
> that the RC remains powered precludes some lower power state of the
> entire SoC?
That's spot on, "power collapse" commonly refers to shutting down as many
parts of the SoC as possible, in order to achieve miliwatt-order power draw.
>
>> Implement full shutdown and re-initialization to allow for powering off
>> the controller.
>>
>> This is mainly indended for SC8280XP with a broken power rail setup,
>> which requires a full RC shutdown/reinit in order to reach SoC-wide
>> power collapse, but sleeping is generally better than not sleeping and
>> less destructive suspend can be implemented later for platforms that
>> support it.
>
> s/indended/intended/
>
>> config PCIE_QCOM
>> bool "Qualcomm PCIe controller (host mode)"
>> depends on OF && (ARCH_QCOM || COMPILE_TEST)
>> + depends on QCOM_COMMAND_DB || QCOM_COMMAND_DB=n
>
> Just out of curiosity since I'm not a Kconfig expert, what does
> "depends on X || X=n" mean?
"not a module"
>
> I guess it's different from
> "depends on (QCOM_COMMAND_DB || !QCOM_COMMAND_DB)", which I also see
> used for QCOM_RPMH?
Yep
>
> Does this reduce compile testing? I see COMPILE_TEST mentioned in a
> few other QCOM_COMMAND_DB dependencies.
I can add "&& COMPILE_TEST", yeah
>
>> + ret_l23 = readl_poll_timeout(pcie->parf + PARF_PM_STTS, val,
>> + val & PM_ENTER_L23, 10000, 100000);
>
> Are these timeout values rooted in some PCIe or Qcom spec? Would be
> nice to have a spec citation or other reason for choosing these
> values.
>
>> + reset_control_assert(res->rst);
>> + usleep_range(2000, 2500);
>
> Ditto, some kind of citation would be nice.
Both are magic values coming from Qualcomm BSP, that we suppose
we can safely assume (and that's a two-level assumption at this
point, I know..) is going to work fine, as it does so on millions
of shipped devices.
Maybe Mani or Bjorn A can find something interesting in the documentation.
Konrad
On 12.02.2024 22:17, Bjorn Helgaas wrote:
> Maybe include the reason in the subject? "Read back" is literally
> what the diff says.
>
> On Sat, Feb 10, 2024 at 06:10:06PM +0100, Konrad Dybcio wrote:
>> To ensure write completion, read the PARF_LTSSM register after setting
>> the LTSSM enable bit before polling for "link up".
>
> The write will obviously complete *some* time; I assume the point is
> that it's important for it to complete before some other event, and it
> would be nice to know why that's important.
Right, that's very much meaningful on non-total-store-ordering
architectures, like arm64, where the CPU receives a store instruction,
but that does not necessarily impact the memory/MMIO state immediately.
Konrad
On Wed, Feb 14, 2024 at 10:35:16PM +0100, Konrad Dybcio wrote:
> On 12.02.2024 22:17, Bjorn Helgaas wrote:
> > Maybe include the reason in the subject? "Read back" is literally
> > what the diff says.
> >
> > On Sat, Feb 10, 2024 at 06:10:06PM +0100, Konrad Dybcio wrote:
> >> To ensure write completion, read the PARF_LTSSM register after setting
> >> the LTSSM enable bit before polling for "link up".
> >
> > The write will obviously complete *some* time; I assume the point is
> > that it's important for it to complete before some other event, and it
> > would be nice to know why that's important.
>
> Right, that's very much meaningful on non-total-store-ordering
> architectures, like arm64, where the CPU receives a store instruction,
> but that does not necessarily impact the memory/MMIO state immediately.
I was hinting that maybe we could say what the other event is, or what
problem this solves? E.g., maybe it's as simple as "there's no point
in polling for link up until after the PARF_LTSSM store completes."
But while the read of PARF_LTSSM might reduce the number of "is the
link up" polls, it probably wouldn't speed anything up otherwise, so I
suspect there's an actual functional reason for this patch, and that's
what I'm getting at.
Bjorn
On Wed, Feb 14, 2024 at 10:33:19PM +0100, Konrad Dybcio wrote:
> On 12.02.2024 22:32, Bjorn Helgaas wrote:
> > "Properly" is a noise word that suggests "we're doing it right this
> > time" but doesn't hint at what actually makes this better.
> >
> > On Sat, Feb 10, 2024 at 06:10:07PM +0100, Konrad Dybcio wrote:
> >> Currently, we've only been minimizing the power draw while keeping the
> >> RC up at all times. This is suboptimal, as it draws a whole lot of power
> >> and prevents the SoC from power collapsing.
> >
> > Is "power collapse" a technical term specific to this device, or is
> > there some more common term that could be used? I assume the fact
> > that the RC remains powered precludes some lower power state of the
> > entire SoC?
>
> That's spot on, "power collapse" commonly refers to shutting down as many
> parts of the SoC as possible, in order to achieve miliwatt-order power draw.
I'm pretty sure "power collapse" is a Qualcomm:ism so better to use
common terminology as Bjorn suggested, and maybe put the Qualcomm
wording in parenthesis or similar.
Johan
On 14.02.2024 23:28, Bjorn Helgaas wrote:
> On Wed, Feb 14, 2024 at 10:35:16PM +0100, Konrad Dybcio wrote:
>> On 12.02.2024 22:17, Bjorn Helgaas wrote:
>>> Maybe include the reason in the subject? "Read back" is literally
>>> what the diff says.
>>>
>>> On Sat, Feb 10, 2024 at 06:10:06PM +0100, Konrad Dybcio wrote:
>>>> To ensure write completion, read the PARF_LTSSM register after setting
>>>> the LTSSM enable bit before polling for "link up".
>>>
>>> The write will obviously complete *some* time; I assume the point is
>>> that it's important for it to complete before some other event, and it
>>> would be nice to know why that's important.
>>
>> Right, that's very much meaningful on non-total-store-ordering
>> architectures, like arm64, where the CPU receives a store instruction,
>> but that does not necessarily impact the memory/MMIO state immediately.
>
> I was hinting that maybe we could say what the other event is, or what
> problem this solves? E.g., maybe it's as simple as "there's no point
> in polling for link up until after the PARF_LTSSM store completes."
>
> But while the read of PARF_LTSSM might reduce the number of "is the
> link up" polls, it probably wouldn't speed anything up otherwise, so I
> suspect there's an actual functional reason for this patch, and that's
> what I'm getting at.
So, the register containing the "enable switch" (PARF_LTSSM) can (due
to the armv8 memory model) be "written" but not "change the value of
memory/mmio from the perspective of other (non-CPU) memory-readers
(such as the MMIO-mapped PCI controller itself)".
In that case, the CPU will happily continue calling qcom_pcie_link_up()
in a loop, waiting for the PCIe controller to bring the link up, however
the PCIE controller may have never received the PARF_LTSSM "enable link"
write by the time we decide to time out on checking the link status.
It may also never happen for you, but that's exactly like a classic race
condition, where it may simply not manifest due to the code around the
problematic lines hiding it. It may also only manifest on certain CPU
cores that try to be smarter than you and keep reordering/delaying
instructions if they don't seem immediately necessary.
Konrad
On 15.02.2024 08:13, Johan Hovold wrote:
> On Wed, Feb 14, 2024 at 10:33:19PM +0100, Konrad Dybcio wrote:
>> On 12.02.2024 22:32, Bjorn Helgaas wrote:
>>> "Properly" is a noise word that suggests "we're doing it right this
>>> time" but doesn't hint at what actually makes this better.
>>>
>>> On Sat, Feb 10, 2024 at 06:10:07PM +0100, Konrad Dybcio wrote:
>>>> Currently, we've only been minimizing the power draw while keeping the
>>>> RC up at all times. This is suboptimal, as it draws a whole lot of power
>>>> and prevents the SoC from power collapsing.
>>>
>>> Is "power collapse" a technical term specific to this device, or is
>>> there some more common term that could be used? I assume the fact
>>> that the RC remains powered precludes some lower power state of the
>>> entire SoC?
>>
>> That's spot on, "power collapse" commonly refers to shutting down as many
>> parts of the SoC as possible, in order to achieve miliwatt-order power draw.
>
> I'm pretty sure "power collapse" is a Qualcomm:ism so better to use
> common terminology as Bjorn suggested, and maybe put the Qualcomm
> wording in parenthesis or similar.
Ok, I keep hearing it so much that I had previously assumed it's the
standard way of describing it.. I'll reword this.
Konrad
On 12.02.2024 22:14, Bjorn Helgaas wrote:
> Would be nice to have a hint in the subject line about what this does.
> Also capitalize to match the others ("PCI: qcom: <Capitalized verb>").
>
> On Sat, Feb 10, 2024 at 06:10:05PM +0100, Konrad Dybcio wrote:
>> At least on SC8280XP, if the PCIe reset is asserted, the corresponding
>> AUX_CLK will be stuck at 'off'. This has not been an issue so far,
>> since the reset is both left de-asserted by the previous boot stages
>> and the driver only toggles it briefly in .init.
>>
>> As part of the upcoming suspend prodecure however, the reset will be
>> held asserted.
>
> s/prodecure/procedure/
Before I get around to resending, does the commit look fine otherwise?
Konrad
On Thu, Feb 15, 2024 at 11:21:45AM +0100, Konrad Dybcio wrote:
> On 14.02.2024 23:28, Bjorn Helgaas wrote:
> > On Wed, Feb 14, 2024 at 10:35:16PM +0100, Konrad Dybcio wrote:
> >> On 12.02.2024 22:17, Bjorn Helgaas wrote:
> >>> Maybe include the reason in the subject? "Read back" is literally
> >>> what the diff says.
> >>>
> >>> On Sat, Feb 10, 2024 at 06:10:06PM +0100, Konrad Dybcio wrote:
> >>>> To ensure write completion, read the PARF_LTSSM register after setting
> >>>> the LTSSM enable bit before polling for "link up".
> >>>
> >>> The write will obviously complete *some* time; I assume the point is
> >>> that it's important for it to complete before some other event, and it
> >>> would be nice to know why that's important.
> >>
> >> Right, that's very much meaningful on non-total-store-ordering
> >> architectures, like arm64, where the CPU receives a store instruction,
> >> but that does not necessarily impact the memory/MMIO state immediately.
> >
> > I was hinting that maybe we could say what the other event is, or what
> > problem this solves? E.g., maybe it's as simple as "there's no point
> > in polling for link up until after the PARF_LTSSM store completes."
> >
> > But while the read of PARF_LTSSM might reduce the number of "is the
> > link up" polls, it probably wouldn't speed anything up otherwise, so I
> > suspect there's an actual functional reason for this patch, and that's
> > what I'm getting at.
>
> So, the register containing the "enable switch" (PARF_LTSSM) can (due
> to the armv8 memory model) be "written" but not "change the value of
> memory/mmio from the perspective of other (non-CPU) memory-readers
> (such as the MMIO-mapped PCI controller itself)".
>
> In that case, the CPU will happily continue calling qcom_pcie_link_up()
> in a loop, waiting for the PCIe controller to bring the link up, however
> the PCIE controller may have never received the PARF_LTSSM "enable link"
> write by the time we decide to time out on checking the link status.
>
> It may also never happen for you, but that's exactly like a classic race
> condition, where it may simply not manifest due to the code around the
> problematic lines hiding it. It may also only manifest on certain CPU
> cores that try to be smarter than you and keep reordering/delaying
> instructions if they don't seem immediately necessary.
Does this mean the register is mapped incorrectly, e.g., I see arm64
has many different kinds of mappings for cacheability,
write-buffering, etc?
Or, if it is already mapped correctly, are we confident that none of
the *other* register writes need similar treatment? Is there a rule
we can apply to know when the read-after-write is needed?
Bjorn
On 15.02.2024 17:11, Bjorn Helgaas wrote:
> On Thu, Feb 15, 2024 at 11:21:45AM +0100, Konrad Dybcio wrote:
>> On 14.02.2024 23:28, Bjorn Helgaas wrote:
>>> On Wed, Feb 14, 2024 at 10:35:16PM +0100, Konrad Dybcio wrote:
>>>> On 12.02.2024 22:17, Bjorn Helgaas wrote:
>>>>> Maybe include the reason in the subject? "Read back" is literally
>>>>> what the diff says.
>>>>>
>>>>> On Sat, Feb 10, 2024 at 06:10:06PM +0100, Konrad Dybcio wrote:
>>>>>> To ensure write completion, read the PARF_LTSSM register after setting
>>>>>> the LTSSM enable bit before polling for "link up".
>>>>>
>>>>> The write will obviously complete *some* time; I assume the point is
>>>>> that it's important for it to complete before some other event, and it
>>>>> would be nice to know why that's important.
>>>>
>>>> Right, that's very much meaningful on non-total-store-ordering
>>>> architectures, like arm64, where the CPU receives a store instruction,
>>>> but that does not necessarily impact the memory/MMIO state immediately.
>>>
>>> I was hinting that maybe we could say what the other event is, or what
>>> problem this solves? E.g., maybe it's as simple as "there's no point
>>> in polling for link up until after the PARF_LTSSM store completes."
>>>
>>> But while the read of PARF_LTSSM might reduce the number of "is the
>>> link up" polls, it probably wouldn't speed anything up otherwise, so I
>>> suspect there's an actual functional reason for this patch, and that's
>>> what I'm getting at.
>>
>> So, the register containing the "enable switch" (PARF_LTSSM) can (due
>> to the armv8 memory model) be "written" but not "change the value of
>> memory/mmio from the perspective of other (non-CPU) memory-readers
>> (such as the MMIO-mapped PCI controller itself)".
>>
>> In that case, the CPU will happily continue calling qcom_pcie_link_up()
>> in a loop, waiting for the PCIe controller to bring the link up, however
>> the PCIE controller may have never received the PARF_LTSSM "enable link"
>> write by the time we decide to time out on checking the link status.
>>
>> It may also never happen for you, but that's exactly like a classic race
>> condition, where it may simply not manifest due to the code around the
>> problematic lines hiding it. It may also only manifest on certain CPU
>> cores that try to be smarter than you and keep reordering/delaying
>> instructions if they don't seem immediately necessary.
>
> Does this mean the register is mapped incorrectly, e.g., I see arm64
> has many different kinds of mappings for cacheability,
> write-buffering, etc?
No, it's correctly mapped as "device" memory, but even that gives no
guarantees about when the writes are "flushed" out of the CPU/cluster
>
> Or, if it is already mapped correctly, are we confident that none of
> the *other* register writes need similar treatment? Is there a rule
> we can apply to know when the read-after-write is needed?
Generally, it's a good idea to add such readbacks after all timing-
critical writes, especially when they concern asserting reset,
enabling/disabling power, etc., to make sure we're not assuming the
hardware state of a peripheral has changed before we ask it to do so.
Konrad
On Thu, Feb 15, 2024 at 07:44:27PM +0100, Konrad Dybcio wrote:
> On 15.02.2024 17:11, Bjorn Helgaas wrote:
> > On Thu, Feb 15, 2024 at 11:21:45AM +0100, Konrad Dybcio wrote:
> >> On 14.02.2024 23:28, Bjorn Helgaas wrote:
> >>> On Wed, Feb 14, 2024 at 10:35:16PM +0100, Konrad Dybcio wrote:
> >>>> On 12.02.2024 22:17, Bjorn Helgaas wrote:
> >>>>> Maybe include the reason in the subject? "Read back" is literally
> >>>>> what the diff says.
> >>>>>
> >>>>> On Sat, Feb 10, 2024 at 06:10:06PM +0100, Konrad Dybcio wrote:
> >>>>>> To ensure write completion, read the PARF_LTSSM register after setting
> >>>>>> the LTSSM enable bit before polling for "link up".
> >>>>>
> >>>>> The write will obviously complete *some* time; I assume the point is
> >>>>> that it's important for it to complete before some other event, and it
> >>>>> would be nice to know why that's important.
> >>>>
> >>>> Right, that's very much meaningful on non-total-store-ordering
> >>>> architectures, like arm64, where the CPU receives a store instruction,
> >>>> but that does not necessarily impact the memory/MMIO state immediately.
> >>>
> >>> I was hinting that maybe we could say what the other event is, or what
> >>> problem this solves? E.g., maybe it's as simple as "there's no point
> >>> in polling for link up until after the PARF_LTSSM store completes."
> >>>
> >>> But while the read of PARF_LTSSM might reduce the number of "is the
> >>> link up" polls, it probably wouldn't speed anything up otherwise, so I
> >>> suspect there's an actual functional reason for this patch, and that's
> >>> what I'm getting at.
> >>
> >> So, the register containing the "enable switch" (PARF_LTSSM) can (due
> >> to the armv8 memory model) be "written" but not "change the value of
> >> memory/mmio from the perspective of other (non-CPU) memory-readers
> >> (such as the MMIO-mapped PCI controller itself)".
> >>
> >> In that case, the CPU will happily continue calling qcom_pcie_link_up()
> >> in a loop, waiting for the PCIe controller to bring the link up, however
> >> the PCIE controller may have never received the PARF_LTSSM "enable link"
> >> write by the time we decide to time out on checking the link status.
This makes no sense. As Bjorn already said, you're just polling for the
link to come up (for a second). And unless you have something else that
depends on the write to have reached the device, there is no need to
read it back. It's not going to be cached indefinitely if that's what
you fear.
> Generally, it's a good idea to add such readbacks after all timing-
> critical writes, especially when they concern asserting reset,
> enabling/disabling power, etc., to make sure we're not assuming the
> hardware state of a peripheral has changed before we ask it to do so.
Again no, there is no general need to do that. It all depends on what
the code does and how the device works.
Johan
On 2/10/2024 10:40 PM, Konrad Dybcio wrote:
> Currently, we've only been minimizing the power draw while keeping the
> RC up at all times. This is suboptimal, as it draws a whole lot of power
> and prevents the SoC from power collapsing.
>
> Implement full shutdown and re-initialization to allow for powering off
> the controller.
>
> This is mainly indended for SC8280XP with a broken power rail setup,
> which requires a full RC shutdown/reinit in order to reach SoC-wide
> power collapse, but sleeping is generally better than not sleeping and
> less destructive suspend can be implemented later for platforms that
> support it.
>
> Co-developed-by: Bjorn Andersson <[email protected]>
> Signed-off-by: Konrad Dybcio <[email protected]>
> ---
> drivers/pci/controller/dwc/Kconfig | 1 +
> drivers/pci/controller/dwc/pcie-qcom.c | 159 ++++++++++++++++++++++++++-------
> 2 files changed, 126 insertions(+), 34 deletions(-)
>
> diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig
> index 8afacc90c63b..4ce266951cb6 100644
> --- a/drivers/pci/controller/dwc/Kconfig
> +++ b/drivers/pci/controller/dwc/Kconfig
> @@ -268,6 +268,7 @@ config PCIE_DW_PLAT_EP
> config PCIE_QCOM
> bool "Qualcomm PCIe controller (host mode)"
> depends on OF && (ARCH_QCOM || COMPILE_TEST)
> + depends on QCOM_COMMAND_DB || QCOM_COMMAND_DB=n
> depends on PCI_MSI
> select PCIE_DW_HOST
> select CRC8
> diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
> index 6a469ed213ce..c807833ee4a7 100644
> --- a/drivers/pci/controller/dwc/pcie-qcom.c
> +++ b/drivers/pci/controller/dwc/pcie-qcom.c
> @@ -30,13 +30,18 @@
> #include <linux/reset.h>
> #include <linux/slab.h>
> #include <linux/types.h>
> +#include <soc/qcom/cmd-db.h>
>
> #include "../../pci.h"
> #include "pcie-designware.h"
>
> +#include <dt-bindings/interconnect/qcom,icc.h>
> +#include <dt-bindings/interconnect/qcom,rpm-icc.h>
> +
> /* PARF registers */
> #define PARF_SYS_CTRL 0x00
> #define PARF_PM_CTRL 0x20
> +#define PARF_PM_STTS 0x24
> #define PARF_PCS_DEEMPH 0x34
> #define PARF_PCS_SWING 0x38
> #define PARF_PHY_CTRL 0x40
> @@ -80,7 +85,10 @@
> #define L1_CLK_RMV_DIS BIT(1)
>
> /* PARF_PM_CTRL register fields */
> -#define REQ_NOT_ENTR_L1 BIT(5)
> +#define REQ_NOT_ENTR_L1 BIT(5) /* "Prevent L0->L1" */
> +
> +/* PARF_PM_STTS register fields */
> +#define PM_ENTER_L23 BIT(5)
>
> /* PARF_PCS_DEEMPH register fields */
> #define PCS_DEEMPH_TX_DEEMPH_GEN1(x) FIELD_PREP(GENMASK(21, 16), x)
> @@ -122,6 +130,7 @@
>
> /* ELBI_SYS_CTRL register fields */
> #define ELBI_SYS_CTRL_LT_ENABLE BIT(0)
> +#define ELBI_SYS_CTRL_PME_TURNOFF_MSG BIT(4)
>
> /* AXI_MSTR_RESP_COMP_CTRL0 register fields */
> #define CFG_REMOTE_RD_REQ_BRIDGE_SIZE_2K 0x4
> @@ -243,6 +252,7 @@ struct qcom_pcie {
> const struct qcom_pcie_cfg *cfg;
> struct dentry *debugfs;
> bool suspended;
> + bool soc_is_rpmh;
> };
>
> #define to_qcom_pcie(x) dev_get_drvdata((x)->dev)
> @@ -272,6 +282,24 @@ static int qcom_pcie_start_link(struct dw_pcie *pci)
> return 0;
> }
>
> +static int qcom_pcie_stop_link(struct dw_pcie *pci)
> +{
> + struct qcom_pcie *pcie = to_qcom_pcie(pci);
> + u32 ret_l23, val;
> +
> + writel(ELBI_SYS_CTRL_PME_TURNOFF_MSG, pcie->elbi + ELBI_SYS_CTRL);
> + readl(pcie->elbi + ELBI_SYS_CTRL);
> +
> + ret_l23 = readl_poll_timeout(pcie->parf + PARF_PM_STTS, val,
> + val & PM_ENTER_L23, 10000, 100000);
> + if (ret_l23) {
> + dev_err(pci->dev, "Failed to enter L2/L3\n");
> + return -ETIMEDOUT;
> + }
> +
> + return 0;
> +}
> +
> static void qcom_pcie_clear_hpc(struct dw_pcie *pci)
> {
> u16 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
> @@ -987,9 +1015,19 @@ static void qcom_pcie_host_post_init_2_7_0(struct qcom_pcie *pcie)
> static void qcom_pcie_deinit_2_7_0(struct qcom_pcie *pcie)
> {
> struct qcom_pcie_resources_2_7_0 *res = &pcie->res.v2_7_0;
> + u32 val;
> +
> + /* Disable PCIe clocks and resets */
> + val = readl(pcie->parf + PARF_PHY_CTRL);
> + val |= PHY_TEST_PWR_DOWN;
> + writel(val, pcie->parf + PARF_PHY_CTRL);
> + readl(pcie->parf + PARF_PHY_CTRL);
>
> clk_bulk_disable_unprepare(res->num_clks, res->clks);
>
> + reset_control_assert(res->rst);
> + usleep_range(2000, 2500);
> +
> regulator_bulk_disable(ARRAY_SIZE(res->supplies), res->supplies);
> }
>
> @@ -1545,6 +1583,9 @@ static int qcom_pcie_probe(struct platform_device *pdev)
> goto err_phy_exit;
> }
>
> + /* If the soc features RPMh, cmd_db must have been prepared by now */
> + pcie->soc_is_rpmh = !cmd_db_ready();
> +
> qcom_pcie_icc_update(pcie);
>
> if (pcie->mhi)
> @@ -1561,58 +1602,108 @@ static int qcom_pcie_probe(struct platform_device *pdev)
> return ret;
> }
>
> -static int qcom_pcie_suspend_noirq(struct device *dev)
> +static int qcom_pcie_resume_noirq(struct device *dev)
> {
> struct qcom_pcie *pcie = dev_get_drvdata(dev);
> int ret;
>
> - /*
> - * Set minimum bandwidth required to keep data path functional during
> - * suspend.
> - */
> - ret = icc_set_bw(pcie->icc_mem, 0, kBps_to_icc(1));
> - if (ret) {
> - dev_err(dev, "Failed to set interconnect bandwidth: %d\n", ret);
> - return ret;
> + if (pcie->soc_is_rpmh) {
> + /*
> + * Undo the tag change from qcom_pcie_suspend_noirq first in
> + * case RPMh spontaneously decides to power collapse the
> + * platform based on other inputs.
> + */
> + icc_set_tag(pcie->icc_mem, QCOM_ICC_TAG_ALWAYS);
> +
> + /* Flush the tag change */
> + ret = icc_enable(pcie->icc_mem);
> + if (ret) {
> + dev_err(pcie->pci->dev, "failed to icc_enable: %d", ret);
> + return ret;
> + }
> }
>
> - /*
> - * Turn OFF the resources only for controllers without active PCIe
> - * devices. For controllers with active devices, the resources are kept
> - * ON and the link is expected to be in L0/L1 (sub)states.
> - *
> - * Turning OFF the resources for controllers with active PCIe devices
> - * will trigger access violation during the end of the suspend cycle,
> - * as kernel tries to access the PCIe devices config space for masking
> - * MSIs.
> - *
> - * Also, it is not desirable to put the link into L2/L3 state as that
> - * implies VDD supply will be removed and the devices may go into
> - * powerdown state. This will affect the lifetime of the storage devices
> - * like NVMe.
> - */
> - if (!dw_pcie_link_up(pcie->pci)) {
> - qcom_pcie_host_deinit(&pcie->pci->pp);
> - pcie->suspended = true;
> - }
> + /* Only check this now to make sure the icc tag has been set. */
> + if (!pcie->suspended)
> + return 0;
> +
> + ret = qcom_pcie_host_init(&pcie->pci->pp);
> + if (ret)
> + goto revert_icc_tag;
> +
> + dw_pcie_setup_rc(&pcie->pci->pp);
> +
> + ret = qcom_pcie_start_link(pcie->pci);
> + if (ret)
> + goto deinit_host;
> +
> + /* Ignore the retval, the devices may come up later. */
> + dw_pcie_wait_for_link(pcie->pci);
> +
> + qcom_pcie_icc_update(pcie);
> +
> + pcie->suspended = false;
>
> return 0;
> +
> +deinit_host:
> + qcom_pcie_host_deinit(&pcie->pci->pp);
> +revert_icc_tag:
> + if (pcie->soc_is_rpmh) {
> + icc_set_tag(pcie->icc_mem, QCOM_ICC_TAG_WAKE);
> +
> + /* Ignore the retval, failing here would be tragic anyway.. */
> + icc_enable(pcie->icc_mem);
> + }
> +
> + return ret;
> }
>
> -static int qcom_pcie_resume_noirq(struct device *dev)
> +static int qcom_pcie_suspend_noirq(struct device *dev)
> {
> struct qcom_pcie *pcie = dev_get_drvdata(dev);
> int ret;
>
> - if (pcie->suspended) {
> - ret = qcom_pcie_host_init(&pcie->pci->pp);
> + if (pcie->suspended)
> + return 0;
> +
> + if (dw_pcie_link_up(pcie->pci)) {
> + ret = qcom_pcie_stop_link(pcie->pci);
> if (ret)
> return ret;
> + }
>
> - pcie->suspended = false;
> + qcom_pcie_host_deinit(&pcie->pci->pp);
> +
> + if (pcie->soc_is_rpmh) {
> + /*
> + * The PCIe RC may be covertly accessed by the secure firmware
> + * on sleep exit. Use the WAKE bucket to let RPMh pull the plug
> + * on PCIe in sleep, but guarantee it comes back up for resume.
> + */
> + icc_set_tag(pcie->icc_mem, QCOM_ICC_TAG_WAKE);
> +
> + /* Flush the tag change */
> + ret = icc_enable(pcie->icc_mem);
> + if (ret) {
> + dev_err(pcie->pci->dev, "failed to icc_enable %d\n", ret);
> +
> + /* Revert everything and pray icc calls succeed */
> + return qcom_pcie_resume_noirq(dev);
> + }
> + } else {
> + /*
> + * Set minimum bandwidth required to keep data path functional
> + * during suspend.
> + */
calling qcom_pcie_host_deinit(&pcie->pci->pp) above will turn off all
the resources, setting BW to 1Kbps will not make sense here.
- Krishna Chaitanya.
> + ret = icc_set_bw(pcie->icc_mem, 0, kBps_to_icc(1));
> + if (ret) {
> + dev_err(dev, "Failed to set interconnect bandwidth: %d\n", ret);
> + return ret;
> + }
> }
>
> - qcom_pcie_icc_update(pcie);
> + pcie->suspended = true;
>
> return 0;
> }
>
On 2/16/24 07:52, Johan Hovold wrote:
> On Thu, Feb 15, 2024 at 07:44:27PM +0100, Konrad Dybcio wrote:
>> On 15.02.2024 17:11, Bjorn Helgaas wrote:
>>> On Thu, Feb 15, 2024 at 11:21:45AM +0100, Konrad Dybcio wrote:
>>>> On 14.02.2024 23:28, Bjorn Helgaas wrote:
>>>>> On Wed, Feb 14, 2024 at 10:35:16PM +0100, Konrad Dybcio wrote:
>>>>>> On 12.02.2024 22:17, Bjorn Helgaas wrote:
>>>>>>> Maybe include the reason in the subject? "Read back" is literally
>>>>>>> what the diff says.
>>>>>>>
>>>>>>> On Sat, Feb 10, 2024 at 06:10:06PM +0100, Konrad Dybcio wrote:
>>>>>>>> To ensure write completion, read the PARF_LTSSM register after setting
>>>>>>>> the LTSSM enable bit before polling for "link up".
>>>>>>>
>>>>>>> The write will obviously complete *some* time; I assume the point is
>>>>>>> that it's important for it to complete before some other event, and it
>>>>>>> would be nice to know why that's important.
>>>>>>
>>>>>> Right, that's very much meaningful on non-total-store-ordering
>>>>>> architectures, like arm64, where the CPU receives a store instruction,
>>>>>> but that does not necessarily impact the memory/MMIO state immediately.
>>>>>
>>>>> I was hinting that maybe we could say what the other event is, or what
>>>>> problem this solves? E.g., maybe it's as simple as "there's no point
>>>>> in polling for link up until after the PARF_LTSSM store completes."
>>>>>
>>>>> But while the read of PARF_LTSSM might reduce the number of "is the
>>>>> link up" polls, it probably wouldn't speed anything up otherwise, so I
>>>>> suspect there's an actual functional reason for this patch, and that's
>>>>> what I'm getting at.
>>>>
>>>> So, the register containing the "enable switch" (PARF_LTSSM) can (due
>>>> to the armv8 memory model) be "written" but not "change the value of
>>>> memory/mmio from the perspective of other (non-CPU) memory-readers
>>>> (such as the MMIO-mapped PCI controller itself)".
>>>>
>>>> In that case, the CPU will happily continue calling qcom_pcie_link_up()
>>>> in a loop, waiting for the PCIe controller to bring the link up, however
>>>> the PCIE controller may have never received the PARF_LTSSM "enable link"
>>>> write by the time we decide to time out on checking the link status.
>
> This makes no sense. As Bjorn already said, you're just polling for the
> link to come up (for a second). And unless you have something else that
> depends on the write to have reached the device, there is no need to
> read it back. It's not going to be cached indefinitely if that's what
> you fear.
The point is, if we know that the hardware is expected to return "done"
within the polling timeout value of receiving the request to do so, we
are actively taking away an unknown amount of time from that timeout.
So, if the polling condition becomes true after 980ms, but due to write
buffering the value reached the PCIe hardware after 21 ms, we're gonna
hit a timeout. Or under truly extreme circumstances, the polling may
time out before the write has even arrived at the PCIe hw.
>
>> Generally, it's a good idea to add such readbacks after all timing-
>> critical writes, especially when they concern asserting reset,
>> enabling/disabling power, etc., to make sure we're not assuming the
>> hardware state of a peripheral has changed before we ask it to do so.
>
> Again no, there is no general need to do that. It all depends on what
> the code does and how the device works.
Agreed it's not necessary *in general*, but as I pointed out, this is
an operation that we expect to complete within a set time frame, which
involves external hardware.
Konrad
On Fri, Mar 15, 2024 at 11:16:59AM +0100, Konrad Dybcio wrote:
>
>
> On 2/16/24 07:52, Johan Hovold wrote:
> > On Thu, Feb 15, 2024 at 07:44:27PM +0100, Konrad Dybcio wrote:
> > > On 15.02.2024 17:11, Bjorn Helgaas wrote:
> > > > On Thu, Feb 15, 2024 at 11:21:45AM +0100, Konrad Dybcio wrote:
> > > > > On 14.02.2024 23:28, Bjorn Helgaas wrote:
> > > > > > On Wed, Feb 14, 2024 at 10:35:16PM +0100, Konrad Dybcio wrote:
> > > > > > > On 12.02.2024 22:17, Bjorn Helgaas wrote:
> > > > > > > > Maybe include the reason in the subject? "Read back" is literally
> > > > > > > > what the diff says.
> > > > > > > >
> > > > > > > > On Sat, Feb 10, 2024 at 06:10:06PM +0100, Konrad Dybcio wrote:
> > > > > > > > > To ensure write completion, read the PARF_LTSSM register after setting
> > > > > > > > > the LTSSM enable bit before polling for "link up".
> > > > > > > >
> > > > > > > > The write will obviously complete *some* time; I assume the point is
> > > > > > > > that it's important for it to complete before some other event, and it
> > > > > > > > would be nice to know why that's important.
> > > > > > >
> > > > > > > Right, that's very much meaningful on non-total-store-ordering
> > > > > > > architectures, like arm64, where the CPU receives a store instruction,
> > > > > > > but that does not necessarily impact the memory/MMIO state immediately.
> > > > > >
> > > > > > I was hinting that maybe we could say what the other event is, or what
> > > > > > problem this solves? E.g., maybe it's as simple as "there's no point
> > > > > > in polling for link up until after the PARF_LTSSM store completes."
> > > > > >
> > > > > > But while the read of PARF_LTSSM might reduce the number of "is the
> > > > > > link up" polls, it probably wouldn't speed anything up otherwise, so I
> > > > > > suspect there's an actual functional reason for this patch, and that's
> > > > > > what I'm getting at.
> > > > >
> > > > > So, the register containing the "enable switch" (PARF_LTSSM) can (due
> > > > > to the armv8 memory model) be "written" but not "change the value of
> > > > > memory/mmio from the perspective of other (non-CPU) memory-readers
> > > > > (such as the MMIO-mapped PCI controller itself)".
> > > > >
> > > > > In that case, the CPU will happily continue calling qcom_pcie_link_up()
> > > > > in a loop, waiting for the PCIe controller to bring the link up, however
> > > > > the PCIE controller may have never received the PARF_LTSSM "enable link"
> > > > > write by the time we decide to time out on checking the link status.
> >
> > This makes no sense. As Bjorn already said, you're just polling for the
> > link to come up (for a second). And unless you have something else that
> > depends on the write to have reached the device, there is no need to
> > read it back. It's not going to be cached indefinitely if that's what
> > you fear.
>
> The point is, if we know that the hardware is expected to return "done"
> within the polling timeout value of receiving the request to do so, we
> are actively taking away an unknown amount of time from that timeout.
>
> So, if the polling condition becomes true after 980ms, but due to write
> buffering the value reached the PCIe hardware after 21 ms, we're gonna
> hit a timeout. Or under truly extreme circumstances, the polling may
> time out before the write has even arrived at the PCIe hw.
>
You should've mentioned the actual reason for doing the readback in the commit
message. That would've clarified the intention.
> >
> > > Generally, it's a good idea to add such readbacks after all timing-
> > > critical writes, especially when they concern asserting reset,
> > > enabling/disabling power, etc., to make sure we're not assuming the
> > > hardware state of a peripheral has changed before we ask it to do so.
> >
> > Again no, there is no general need to do that. It all depends on what
> > the code does and how the device works.
>
> Agreed it's not necessary *in general*, but as I pointed out, this is
> an operation that we expect to complete within a set time frame, which
> involves external hardware.
>
As I pointed out in the review of v1 series, LTSSM is in PARF register region
and the link status is in DBI region. Techinically both belongs to the PCIe
domain, but I am not 100% sure that both belong to the same hw domain or
different. So I cannot rule out the possibility that the first write may not
reach the hardware by the time link status is queried.
That's the reason I gave my R-b tag. But I need to confirm with the hw team
on this to be sure since this may be applicable to other drivers also.
- Mani
--
மணிவண்ணன் சதாசிவம்
On Fri, Mar 15, 2024 at 11:16:59AM +0100, Konrad Dybcio wrote:
> On 2/16/24 07:52, Johan Hovold wrote:
> > This makes no sense. As Bjorn already said, you're just polling for the
> > link to come up (for a second). And unless you have something else that
> > depends on the write to have reached the device, there is no need to
> > read it back. It's not going to be cached indefinitely if that's what
> > you fear.
>
> The point is, if we know that the hardware is expected to return "done"
> within the polling timeout value of receiving the request to do so, we
> are actively taking away an unknown amount of time from that timeout.
We're talking about microseconds, not milliseconds or seconds as you
seem to believe.
> So, if the polling condition becomes true after 980ms, but due to write
> buffering the value reached the PCIe hardware after 21 ms, we're gonna
> hit a timeout. Or under truly extreme circumstances, the polling may
> time out before the write has even arrived at the PCIe hw.
So the write latency is not an issue here.
Johan
On 20.02.2024 5:12 AM, Krishna Chaitanya Chundru wrote:
>
>
> On 2/10/2024 10:40 PM, Konrad Dybcio wrote:
>> Currently, we've only been minimizing the power draw while keeping the
>> RC up at all times. This is suboptimal, as it draws a whole lot of power
>> and prevents the SoC from power collapsing.
>>
>> Implement full shutdown and re-initialization to allow for powering off
>> the controller.
>>
>> This is mainly indended for SC8280XP with a broken power rail setup,
>> which requires a full RC shutdown/reinit in order to reach SoC-wide
>> power collapse, but sleeping is generally better than not sleeping and
>> less destructive suspend can be implemented later for platforms that
>> support it.
>>
>> Co-developed-by: Bjorn Andersson <[email protected]>
>> Signed-off-by: Konrad Dybcio <[email protected]>
>> ---
[...]
>> + if (pcie->soc_is_rpmh) {
>> + /*
>> + * The PCIe RC may be covertly accessed by the secure firmware
>> + * on sleep exit. Use the WAKE bucket to let RPMh pull the plug
>> + * on PCIe in sleep, but guarantee it comes back up for resume.
>> + */
>> + icc_set_tag(pcie->icc_mem, QCOM_ICC_TAG_WAKE);
>> +
>> + /* Flush the tag change */
>> + ret = icc_enable(pcie->icc_mem);
>> + if (ret) {
>> + dev_err(pcie->pci->dev, "failed to icc_enable %d\n", ret);
>> +
>> + /* Revert everything and pray icc calls succeed */
>> + return qcom_pcie_resume_noirq(dev);
>> + }
>> + } else {
>> + /*
>> + * Set minimum bandwidth required to keep data path functional
>> + * during suspend.
>> + */
> calling qcom_pcie_host_deinit(&pcie->pci->pp) above will turn off all the resources, setting BW to 1Kbps will not make sense here.
This is preserving the current behavior, it may be revised later.
See ad9b9b6e36c9 ("PCI: qcom: Add support for system suspend and resume")
that introduced it, in a perhaps overly 8280-centric fashion.
Konrad
On 27.03.2024 8:37 PM, Konrad Dybcio wrote:
> On 15.03.2024 5:47 PM, Johan Hovold wrote:
>> On Fri, Mar 15, 2024 at 11:16:59AM +0100, Konrad Dybcio wrote:
>>> On 2/16/24 07:52, Johan Hovold wrote:
>>
>>>> This makes no sense. As Bjorn already said, you're just polling for the
>>>> link to come up (for a second). And unless you have something else that
>>>> depends on the write to have reached the device, there is no need to
>>>> read it back. It's not going to be cached indefinitely if that's what
>>>> you fear.
>>>
>>> The point is, if we know that the hardware is expected to return "done"
>>> within the polling timeout value of receiving the request to do so, we
>>> are actively taking away an unknown amount of time from that timeout.
>>
>> We're talking about microseconds, not milliseconds or seconds as you
>> seem to believe.
>>
>>> So, if the polling condition becomes true after 980ms, but due to write
>>> buffering the value reached the PCIe hardware after 21 ms, we're gonna
>>> hit a timeout. Or under truly extreme circumstances, the polling may
>>> time out before the write has even arrived at the PCIe hw.
>>
>> So the write latency is not an issue here.
>
> Right, I'm willing to believe the CPU will kick the can down the road
> for this long. I'll drop this.
will not kick the can down the road for this long*
Konrad
On 15.03.2024 5:47 PM, Johan Hovold wrote:
> On Fri, Mar 15, 2024 at 11:16:59AM +0100, Konrad Dybcio wrote:
>> On 2/16/24 07:52, Johan Hovold wrote:
>
>>> This makes no sense. As Bjorn already said, you're just polling for the
>>> link to come up (for a second). And unless you have something else that
>>> depends on the write to have reached the device, there is no need to
>>> read it back. It's not going to be cached indefinitely if that's what
>>> you fear.
>>
>> The point is, if we know that the hardware is expected to return "done"
>> within the polling timeout value of receiving the request to do so, we
>> are actively taking away an unknown amount of time from that timeout.
>
> We're talking about microseconds, not milliseconds or seconds as you
> seem to believe.
>
>> So, if the polling condition becomes true after 980ms, but due to write
>> buffering the value reached the PCIe hardware after 21 ms, we're gonna
>> hit a timeout. Or under truly extreme circumstances, the polling may
>> time out before the write has even arrived at the PCIe hw.
>
> So the write latency is not an issue here.
Right, I'm willing to believe the CPU will kick the can down the road
for this long. I'll drop this.
Konrad