There are large number of QSPI irqs that fire during boot/init and later
on every suspend/resume.
This could be made faster by doing DMA instead of PIO.
Below is comparison for number of interrupts raised in 2 acenarios...
Boot up and stabilise
Suspend/Resume
Sequence PIO DMA
=======================
Boot-up 69088 19284
S/R 5066 3430
Vijaya Krishna Nivarthi (2):
arm64: dts: qcom: sc7280: Add stream-id of qspi to iommus
spi: spi-qcom-qspi: Add DMA mode support
arch/arm64/boot/dts/qcom/sc7280.dtsi | 1 +
drivers/spi/spi-qcom-qspi.c | 429 ++++++++++++++++++++++++++++++++---
2 files changed, 399 insertions(+), 31 deletions(-)
--
Qualcomm INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by the Linux Foundation.
This is done as part of adding DMA support to qspi driver.
Signed-off-by: Vijaya Krishna Nivarthi <[email protected]>
---
arch/arm64/boot/dts/qcom/sc7280.dtsi | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi b/arch/arm64/boot/dts/qcom/sc7280.dtsi
index 5e6f9f4..9e05285 100644
--- a/arch/arm64/boot/dts/qcom/sc7280.dtsi
+++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi
@@ -3434,6 +3434,7 @@
qspi: spi@88dc000 {
compatible = "qcom,sc7280-qspi", "qcom,qspi-v1";
reg = <0 0x088dc000 0 0x1000>;
+ iommus = <&apps_smmu 0x20 0x0>;
#address-cells = <1>;
#size-cells = <0>;
interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
--
Qualcomm INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by the Linux Foundation.
For performance improvement in terms of number of interrupts.
Code flow for DMA mode...
Allocate for DMA pools in probe()
For xfers with len > 64 bytes, return true from can_dma()
Thus framework creates and maps an sg table for each xfer buffer.
In transfer_one() enable DMA in MSTR_CONFIG.
Split the dma buffer of each entry of sgt into a maximum of 3 parts...
Unaligned head, aligned middle and tail.
For each part create a dma_cmd_descriptor.
For head and tail allocate dma_data_descriptors
For tx, copy the data into allocated data descriptors
For rx, remember the original buffers to copy from-
allocated descriptors after xfer is complete
For middle part use the aligned dma buffer.
Link each dma_cmd_descriptor to next.
Thus create a chain of descriptors.
Kick-off the xfer by copying 1st cmd descriptor to-
NEXT_DMA_DESC_ADDR register.
On receiving DMA_CHAIN_DONE interrupt, complete the xfer and-
free descriptors.
If timeout happens handle error by freeing descriptors.
In remove() free DMA pools.
Signed-off-by: Vijaya Krishna Nivarthi <[email protected]>
---
drivers/spi/spi-qcom-qspi.c | 429 ++++++++++++++++++++++++++++++++++++++++----
1 file changed, 398 insertions(+), 31 deletions(-)
diff --git a/drivers/spi/spi-qcom-qspi.c b/drivers/spi/spi-qcom-qspi.c
index fab1553..64c3bec 100644
--- a/drivers/spi/spi-qcom-qspi.c
+++ b/drivers/spi/spi-qcom-qspi.c
@@ -13,7 +13,8 @@
#include <linux/pm_opp.h>
#include <linux/spi/spi.h>
#include <linux/spi/spi-mem.h>
-
+#include <linux/dmapool.h>
+#include <linux/dma-mapping.h>
#define QSPI_NUM_CS 2
#define QSPI_BYTES_PER_WORD 4
@@ -62,6 +63,7 @@
#define WR_FIFO_FULL BIT(10)
#define WR_FIFO_OVERRUN BIT(11)
#define TRANSACTION_DONE BIT(16)
+#define DMA_CHAIN_DONE BIT(31)
#define QSPI_ERR_IRQS (RESP_FIFO_UNDERRUN | HRESP_FROM_NOC_ERR | \
WR_FIFO_OVERRUN)
#define QSPI_ALL_IRQS (QSPI_ERR_IRQS | RESP_FIFO_RDY | \
@@ -108,6 +110,10 @@
#define RD_FIFO_RESET 0x0030
#define RESET_FIFO BIT(0)
+#define NEXT_DMA_DESC_ADDR 0x0040
+#define CURRENT_DMA_DESC_ADDR 0x0044
+#define CURRENT_MEM_ADDR 0x0048
+
#define CUR_MEM_ADDR 0x0048
#define HW_VERSION 0x004c
#define RD_FIFO 0x0050
@@ -120,6 +126,22 @@ enum qspi_dir {
QSPI_WRITE,
};
+struct qspi_cmd_desc {
+ uint32_t data_address;
+ uint32_t next_descriptor;
+ uint32_t direction:1;
+ uint32_t multi_io_mode:3;
+ uint32_t reserved1:4;
+ uint32_t fragment:1;
+ uint32_t reserved2:7;
+ uint32_t length:16;
+ //------------------------//
+ uint8_t *bounce_src;
+ uint8_t *bounce_dst;
+ uint32_t bounce_length;
+};
+
+#define QSPI_MAX_NUM_DESC 5
struct qspi_xfer {
union {
const void *tx_buf;
@@ -137,11 +159,30 @@ enum qspi_clocks {
QSPI_NUM_CLKS
};
+enum qspi_xfer_mode {
+ QSPI_INVALID,
+ QSPI_FIFO,
+ QSPI_DMA
+};
+
+/* number of entries in sgt returned from spi framework that we can support */
+#define QSPI_QCOM_MAX_SG 5
+
struct qcom_qspi {
void __iomem *base;
struct device *dev;
struct clk_bulk_data *clks;
struct qspi_xfer xfer;
+ struct dma_pool *dma_cmd_pool;
+ struct dma_pool *dma_data_pool;
+ dma_addr_t dma_cmd_desc[3*QSPI_QCOM_MAX_SG];
+ dma_addr_t dma_data_desc[2*QSPI_QCOM_MAX_SG];
+ void *virt_cmd_desc[3*QSPI_QCOM_MAX_SG];
+ void *virt_data_desc[2*QSPI_QCOM_MAX_SG];
+ unsigned int n_cmd_desc;
+ unsigned int n_data_desc;
+ int xfer_mode;
+ u32 iomode;
struct icc_path *icc_path_cpu_to_qspi;
unsigned long last_speed;
/* Lock to protect data accessed by IRQs */
@@ -151,18 +192,25 @@ struct qcom_qspi {
static u32 qspi_buswidth_to_iomode(struct qcom_qspi *ctrl,
unsigned int buswidth)
{
+ u32 ret;
+
+ /* for DMA we dont write to PIO_XFER_CFG register, so don't shift */
switch (buswidth) {
case 1:
- return SDR_1BIT << MULTI_IO_MODE_SHFT;
+ ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
+ break;
case 2:
- return SDR_2BIT << MULTI_IO_MODE_SHFT;
+ ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_2BIT : SDR_2BIT << MULTI_IO_MODE_SHFT);
+ break;
case 4:
- return SDR_4BIT << MULTI_IO_MODE_SHFT;
+ ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_4BIT : SDR_4BIT << MULTI_IO_MODE_SHFT);
+ break;
default:
dev_warn_once(ctrl->dev,
"Unexpected bus width: %u\n", buswidth);
- return SDR_1BIT << MULTI_IO_MODE_SHFT;
+ ret = (ctrl->xfer_mode == QSPI_DMA ? SDR_1BIT : SDR_1BIT << MULTI_IO_MODE_SHFT);
}
+ return ret;
}
static void qcom_qspi_pio_xfer_cfg(struct qcom_qspi *ctrl)
@@ -223,6 +271,21 @@ static void qcom_qspi_handle_err(struct spi_master *master,
spin_lock_irqsave(&ctrl->lock, flags);
writel(0, ctrl->base + MSTR_INT_EN);
ctrl->xfer.rem_bytes = 0;
+
+ if (ctrl->xfer_mode == QSPI_DMA) {
+ int ii;
+
+ /* free cmd and data descriptors */
+ for (ii = 0; ii < ctrl->n_data_desc; ii++)
+ dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[ii],
+ ctrl->dma_data_desc[ii]);
+ ctrl->n_data_desc = 0;
+
+ for (ii = 0; ii < ctrl->n_cmd_desc; ii++)
+ dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[ii],
+ ctrl->dma_cmd_desc[ii]);
+ ctrl->n_cmd_desc = 0;
+ }
spin_unlock_irqrestore(&ctrl->lock, flags);
}
@@ -230,6 +293,7 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
{
int ret;
unsigned int avg_bw_cpu;
+ unsigned int peak_bw_cpu;
if (speed_hz == ctrl->last_speed)
return 0;
@@ -241,12 +305,16 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
return ret;
}
+ avg_bw_cpu = Bps_to_icc(speed_hz);
/*
- * Set BW quota for CPU as driver supports FIFO mode only.
- * We don't have explicit peak requirement so keep it equal to avg_bw.
+ * Set BW quota for CPU for FIFO to avg_bw
+ * as we don't have explicit peak requirement.
+ * TBD TBD TBD - change this as required for DMA.
+ * As of now same peak requirement seems to be working.
*/
- avg_bw_cpu = Bps_to_icc(speed_hz);
- ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, avg_bw_cpu);
+ peak_bw_cpu = ctrl->xfer_mode == QSPI_FIFO ? avg_bw_cpu : avg_bw_cpu;
+
+ ret = icc_set_bw(ctrl->icc_path_cpu_to_qspi, avg_bw_cpu, peak_bw_cpu);
if (ret) {
dev_err(ctrl->dev, "%s: ICC BW voting failed for cpu: %d\n",
__func__, ret);
@@ -258,6 +326,186 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
return 0;
}
+/* aligned to 32 bytes, not to cross 1KB boundary */
+#define QSPI_ALIGN_REQ 32
+#define QSPI_BOUNDARY_REQ 1024
+
+int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
+ dma_addr_t dma_ptr, uint32_t n_bytes)
+{
+ struct qspi_cmd_desc *virt_cmd_desc, *prev;
+ uint8_t *virt_data_desc;
+ dma_addr_t dma_cmd_desc, dma_data_desc;
+
+ if (virt_ptr && n_bytes >= QSPI_ALIGN_REQ) {
+ dev_err(ctrl->dev,
+ "Exiting to avert memory overwrite, n_bytes-%d\n", n_bytes);
+ return -ENOMEM;
+ }
+
+ /* allocate for dma cmd descriptor */
+ virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
+ GFP_KERNEL, &dma_cmd_desc);
+ if (!virt_cmd_desc) {
+ dev_err(ctrl->dev,
+ "Could not allocate for cmd_desc\n");
+ return -ENOMEM;
+ }
+ ctrl->virt_cmd_desc[ctrl->n_cmd_desc] = virt_cmd_desc;
+ ctrl->dma_cmd_desc[ctrl->n_cmd_desc] = dma_cmd_desc;
+ ctrl->n_cmd_desc++;
+
+ /* allocate for dma data descriptor if unaligned else use pre-aligned */
+ if (virt_ptr) {
+ virt_data_desc = (uint8_t *)dma_pool_zalloc(ctrl->dma_data_pool,
+ GFP_KERNEL, &dma_data_desc);
+ if (!virt_data_desc) {
+ dev_err(ctrl->dev,
+ "Could not allocate for data_desc\n");
+ return -ENOMEM;
+ }
+ ctrl->virt_data_desc[ctrl->n_data_desc] = virt_data_desc;
+ ctrl->dma_data_desc[ctrl->n_data_desc] = dma_data_desc;
+ ctrl->n_data_desc++;
+
+ /*
+ * for tx copy xfer data into allocated buffer
+ * for rx setup bounce info to copy after xfer
+ */
+ if (ctrl->xfer.dir == QSPI_WRITE) {
+ memcpy(virt_data_desc, virt_ptr, n_bytes);
+ } else {
+ virt_cmd_desc->bounce_src = virt_data_desc;
+ virt_cmd_desc->bounce_dst = virt_ptr;
+ virt_cmd_desc->bounce_length = n_bytes;
+ }
+ } else {
+ dma_data_desc = dma_ptr;
+ }
+
+ /* setup cmd descriptor */
+ virt_cmd_desc->data_address = (uint32_t)(uintptr_t)(dma_data_desc);
+ virt_cmd_desc->next_descriptor = 0;
+ virt_cmd_desc->direction = ctrl->xfer.dir;
+ virt_cmd_desc->multi_io_mode = ctrl->iomode;
+ virt_cmd_desc->reserved1 = 0;
+ virt_cmd_desc->fragment = 0;
+ virt_cmd_desc->reserved2 = 0;
+ virt_cmd_desc->length = n_bytes;
+
+ /* update previous descriptor */
+ if (ctrl->n_cmd_desc >= 2) {
+ prev = (ctrl->virt_cmd_desc)[ctrl->n_cmd_desc - 2];
+ prev->next_descriptor = dma_cmd_desc;
+ prev->fragment = 1;
+ }
+
+ return 0;
+}
+
+static int qcom_qspi_setup_dma_desc(struct qcom_qspi *ctrl,
+ struct spi_transfer *xfer)
+{
+ int ret;
+ struct sg_table *sgt;
+ unsigned int sg_total_len = 0;
+ dma_addr_t dma_ptr_sg;
+ unsigned int dma_len_sg;
+ uint32_t prolog_bytes, aligned_bytes, epilog_bytes;
+ dma_addr_t aligned_ptr;
+ int ii;
+ uint8_t *byte_ptr;
+
+ if (ctrl->n_cmd_desc || ctrl->n_data_desc) {
+ dev_err(ctrl->dev, "Remnant dma buffers cmd-%d, data-%d\n",
+ ctrl->n_cmd_desc, ctrl->n_data_desc);
+ return -EIO;
+ }
+
+ sgt = (ctrl->xfer.dir == QSPI_READ) ? &xfer->rx_sg : &xfer->tx_sg;
+ if (!sgt->nents || sgt->nents > QSPI_QCOM_MAX_SG) {
+ dev_err(ctrl->dev, "Cannot handle %d entries in scatter list\n", sgt->nents);
+ return -EINVAL;
+ }
+
+ for (ii = 0; ii < sgt->nents; ii++)
+ sg_total_len += sg_dma_len(sgt->sgl + ii);
+ if (sg_total_len != xfer->len) {
+ dev_err(ctrl->dev, "Data lengths mismatch\n");
+ return -EINVAL;
+ }
+
+ if (ctrl->xfer.dir == QSPI_READ)
+ byte_ptr = (uint8_t *)xfer->rx_buf;
+ else
+ byte_ptr = (uint8_t *)xfer->tx_buf;
+
+ for (ii = 0; ii < sgt->nents; ii++) {
+ dma_ptr_sg = sg_dma_address(sgt->sgl + ii);
+ dma_len_sg = sg_dma_len(sgt->sgl + ii);
+
+ aligned_ptr = PTR_ALIGN(dma_ptr_sg, QSPI_ALIGN_REQ);
+
+ prolog_bytes = min(dma_len_sg, (unsigned int)(aligned_ptr - dma_ptr_sg));
+ if (prolog_bytes) {
+ ret = qcom_qspi_alloc_desc(ctrl, byte_ptr, 0, prolog_bytes);
+ if (ret)
+ goto cleanup;
+ byte_ptr += prolog_bytes;
+ }
+
+ aligned_bytes = PTR_ALIGN_DOWN(dma_len_sg - prolog_bytes, QSPI_ALIGN_REQ);
+ if (aligned_bytes) {
+ ret = qcom_qspi_alloc_desc(ctrl, 0, aligned_ptr, aligned_bytes);
+ if (ret)
+ goto cleanup;
+ byte_ptr += aligned_bytes;
+ }
+
+ epilog_bytes = dma_len_sg - prolog_bytes - aligned_bytes;
+ if (epilog_bytes) {
+ ret = qcom_qspi_alloc_desc(ctrl, byte_ptr, 0, epilog_bytes);
+ if (ret)
+ goto cleanup;
+ byte_ptr += epilog_bytes;
+ }
+ }
+ return 0;
+
+cleanup:
+ dev_err(ctrl->dev, "ERROR cleanup in setup_dma_desc\n");
+ for (ii = 0; ii < ctrl->n_data_desc; ii++)
+ dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[ii],
+ ctrl->dma_data_desc[ii]);
+ ctrl->n_data_desc = 0;
+
+ for (ii = 0; ii < ctrl->n_cmd_desc; ii++)
+ dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[ii],
+ ctrl->dma_cmd_desc[ii]);
+ ctrl->n_cmd_desc = 0;
+ return ret;
+}
+
+static void qcom_qspi_dma_xfer(struct qcom_qspi *ctrl)
+{
+ /* Ack any previous interrupts that might be hanging around */
+ writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_STATUS);
+
+ /* Setup new interrupts */
+ writel(DMA_CHAIN_DONE, ctrl->base + MSTR_INT_EN);
+
+ /* flush all writes */
+ wmb();
+
+ /* kick off transfer */
+ writel((uint32_t)(uintptr_t)(ctrl->dma_cmd_desc)[0], ctrl->base + NEXT_DMA_DESC_ADDR);
+}
+
+/* Switch to DMA if transfer length exceeds this */
+#define QSPI_MAX_BYTES_FIFO 64
+#define NO_TX_DATA_DELAY_FOR_DMA 1
+#define DMA_CONDITIONAL (xfer->len > QSPI_MAX_BYTES_FIFO)
+
static int qcom_qspi_transfer_one(struct spi_master *master,
struct spi_device *slv,
struct spi_transfer *xfer)
@@ -266,6 +514,7 @@ static int qcom_qspi_transfer_one(struct spi_master *master,
int ret;
unsigned long speed_hz;
unsigned long flags;
+ u32 mstr_cfg;
speed_hz = slv->max_speed_hz;
if (xfer->speed_hz)
@@ -276,6 +525,7 @@ static int qcom_qspi_transfer_one(struct spi_master *master,
return ret;
spin_lock_irqsave(&ctrl->lock, flags);
+ mstr_cfg = readl(ctrl->base + MSTR_CONFIG);
/* We are half duplex, so either rx or tx will be set */
if (xfer->rx_buf) {
@@ -290,7 +540,31 @@ static int qcom_qspi_transfer_one(struct spi_master *master,
ctrl->xfer.is_last = list_is_last(&xfer->transfer_list,
&master->cur_msg->transfers);
ctrl->xfer.rem_bytes = xfer->len;
- qcom_qspi_pio_xfer(ctrl);
+
+ if (DMA_CONDITIONAL) {
+ ctrl->xfer_mode = QSPI_DMA;
+ ctrl->iomode = qspi_buswidth_to_iomode(ctrl, ctrl->xfer.buswidth);
+#if NO_TX_DATA_DELAY_FOR_DMA
+ mstr_cfg &= ~(TX_DATA_OE_DELAY_MSK | TX_DATA_DELAY_MSK);
+#endif
+ mstr_cfg &= ~FB_CLK_EN;
+ mstr_cfg &= ~PIN_WPN;
+ mstr_cfg |= DMA_ENABLE;
+ writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
+
+ ret = qcom_qspi_setup_dma_desc(ctrl, xfer);
+ if (ret) {
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+ return ret;
+ }
+ qcom_qspi_dma_xfer(ctrl);
+ } else {
+ ctrl->xfer_mode = QSPI_FIFO;
+ mstr_cfg &= ~DMA_ENABLE;
+ mstr_cfg |= FB_CLK_EN | PIN_WPN;
+ writel(mstr_cfg, ctrl->base + MSTR_CONFIG);
+ qcom_qspi_pio_xfer(ctrl);
+ }
spin_unlock_irqrestore(&ctrl->lock, flags);
@@ -298,6 +572,15 @@ static int qcom_qspi_transfer_one(struct spi_master *master,
return 1;
}
+static bool qcom_qspi_can_dma(struct spi_controller *ctlr,
+ struct spi_device *slv, struct spi_transfer *xfer)
+{
+ if (DMA_CONDITIONAL)
+ return true;
+ else
+ return false;
+}
+
static int qcom_qspi_prepare_message(struct spi_master *master,
struct spi_message *message)
{
@@ -328,6 +611,40 @@ static int qcom_qspi_prepare_message(struct spi_master *master,
return 0;
}
+static int qcom_qspi_alloc_dma(struct qcom_qspi *ctrl)
+{
+ /* allocate for cmd descriptors pool */
+ ctrl->dma_cmd_pool = dma_pool_create("qspi cmd desc pool",
+ ctrl->dev, sizeof(struct qspi_cmd_desc), 0, 0);
+ if (!ctrl->dma_cmd_pool) {
+ dev_err(ctrl->dev, "Could not create dma pool for cmd_desc\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * allocate for data descriptors pool as per alignment
+ * and boundary requirements
+ */
+ ctrl->dma_data_pool = dma_pool_create("qspi data desc pool",
+ ctrl->dev, QSPI_ALIGN_REQ, QSPI_ALIGN_REQ, QSPI_BOUNDARY_REQ);
+ if (!ctrl->dma_data_pool) {
+ dev_err(ctrl->dev, "Could not create dma pool for data desc\n");
+ dma_pool_destroy(ctrl->dma_cmd_pool);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void qcom_qspi_free_dma(struct qcom_qspi *ctrl)
+{
+ /* free pool buffers */
+ dma_pool_destroy(ctrl->dma_data_pool);
+
+ /* free pool */
+ dma_pool_destroy(ctrl->dma_cmd_pool);
+}
+
static irqreturn_t pio_read(struct qcom_qspi *ctrl)
{
u32 rd_fifo_status;
@@ -426,27 +743,58 @@ static irqreturn_t qcom_qspi_irq(int irq, void *dev_id)
int_status = readl(ctrl->base + MSTR_INT_STATUS);
writel(int_status, ctrl->base + MSTR_INT_STATUS);
- if (ctrl->xfer.dir == QSPI_WRITE) {
- if (int_status & WR_FIFO_EMPTY)
- ret = pio_write(ctrl);
- } else {
- if (int_status & RESP_FIFO_RDY)
- ret = pio_read(ctrl);
- }
-
- if (int_status & QSPI_ERR_IRQS) {
- if (int_status & RESP_FIFO_UNDERRUN)
- dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
- if (int_status & WR_FIFO_OVERRUN)
- dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
- if (int_status & HRESP_FROM_NOC_ERR)
- dev_err(ctrl->dev, "IRQ error: NOC response error\n");
- ret = IRQ_HANDLED;
- }
-
- if (!ctrl->xfer.rem_bytes) {
- writel(0, ctrl->base + MSTR_INT_EN);
- spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+ if (ctrl->xfer_mode == QSPI_FIFO) {
+ if (ctrl->xfer.dir == QSPI_WRITE) {
+ if (int_status & WR_FIFO_EMPTY)
+ ret = pio_write(ctrl);
+ } else {
+ if (int_status & RESP_FIFO_RDY)
+ ret = pio_read(ctrl);
+ }
+
+ if (int_status & QSPI_ERR_IRQS) {
+ if (int_status & RESP_FIFO_UNDERRUN)
+ dev_err(ctrl->dev, "IRQ error: FIFO underrun\n");
+ if (int_status & WR_FIFO_OVERRUN)
+ dev_err(ctrl->dev, "IRQ error: FIFO overrun\n");
+ if (int_status & HRESP_FROM_NOC_ERR)
+ dev_err(ctrl->dev, "IRQ error: NOC response error\n");
+ ret = IRQ_HANDLED;
+ }
+
+ if (!ctrl->xfer.rem_bytes) {
+ writel(0, ctrl->base + MSTR_INT_EN);
+ spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+ }
+ } else if (ctrl->xfer_mode == QSPI_DMA) {
+ if (int_status & DMA_CHAIN_DONE) {
+ int ii;
+
+ writel(0, ctrl->base + MSTR_INT_EN);
+
+ if (ctrl->xfer.dir == QSPI_READ) {
+ struct qspi_cmd_desc *cmd_desc;
+
+ for (ii = 0; ii < ctrl->n_cmd_desc; ii++) {
+ cmd_desc = (struct qspi_cmd_desc *)ctrl->virt_cmd_desc[ii];
+ memcpy(cmd_desc->bounce_dst,
+ cmd_desc->bounce_src, cmd_desc->bounce_length);
+ }
+ }
+
+ for (ii = 0; ii < ctrl->n_data_desc; ii++)
+ dma_pool_free(ctrl->dma_data_pool, ctrl->virt_data_desc[ii],
+ ctrl->dma_data_desc[ii]);
+ ctrl->n_data_desc = 0;
+
+ for (ii = 0; ii < ctrl->n_cmd_desc; ii++)
+ dma_pool_free(ctrl->dma_cmd_pool, ctrl->virt_cmd_desc[ii],
+ ctrl->dma_cmd_desc[ii]);
+ ctrl->n_cmd_desc = 0;
+
+ ret = IRQ_HANDLED;
+ spi_finalize_current_transfer(dev_get_drvdata(ctrl->dev));
+ }
}
spin_unlock(&ctrl->lock);
@@ -487,6 +835,9 @@ static int qcom_qspi_probe(struct platform_device *pdev)
if (ret)
return ret;
+ /* set default mode to FIFO */
+ ctrl->xfer_mode = QSPI_FIFO;
+
ctrl->icc_path_cpu_to_qspi = devm_of_icc_get(dev, "qspi-config");
if (IS_ERR(ctrl->icc_path_cpu_to_qspi))
return dev_err_probe(dev, PTR_ERR(ctrl->icc_path_cpu_to_qspi),
@@ -517,7 +868,12 @@ static int qcom_qspi_probe(struct platform_device *pdev)
return ret;
}
+ ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+ if (ret)
+ return dev_err_probe(dev, ret, "could not set DMA mask\n");
+
master->max_speed_hz = 300000000;
+ master->max_dma_len = 65536; /* as per HPG */
master->num_chipselect = QSPI_NUM_CS;
master->bus_num = -1;
master->dev.of_node = pdev->dev.of_node;
@@ -528,6 +884,7 @@ static int qcom_qspi_probe(struct platform_device *pdev)
master->prepare_message = qcom_qspi_prepare_message;
master->transfer_one = qcom_qspi_transfer_one;
master->handle_err = qcom_qspi_handle_err;
+ master->can_dma = qcom_qspi_can_dma;
master->auto_runtime_pm = true;
ret = devm_pm_opp_set_clkname(&pdev->dev, "core");
@@ -540,6 +897,11 @@ static int qcom_qspi_probe(struct platform_device *pdev)
return ret;
}
+ /* allocate for DMA descriptor pools */
+ ret = qcom_qspi_alloc_dma(ctrl);
+ if (ret)
+ return ret;
+
pm_runtime_use_autosuspend(dev);
pm_runtime_set_autosuspend_delay(dev, 250);
pm_runtime_enable(dev);
@@ -556,10 +918,15 @@ static int qcom_qspi_probe(struct platform_device *pdev)
static void qcom_qspi_remove(struct platform_device *pdev)
{
struct spi_master *master = platform_get_drvdata(pdev);
+ struct qcom_qspi *ctrl;
+
+ ctrl = spi_master_get_devdata(master);
/* Unregister _before_ disabling pm_runtime() so we stop transfers */
spi_unregister_master(master);
+ qcom_qspi_free_dma(ctrl);
+
pm_runtime_disable(&pdev->dev);
}
--
Qualcomm INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, hosted by the Linux Foundation.
On Tue, Apr 04, 2023 at 11:33:20PM +0530, Vijaya Krishna Nivarthi wrote:
A few quick review comments, mostly coding style though.
> +struct qspi_cmd_desc {
> + uint32_t data_address;
> + uint32_t next_descriptor;
> + uint32_t direction:1;
> + uint32_t multi_io_mode:3;
> + uint32_t reserved1:4;
> + uint32_t fragment:1;
> + uint32_t reserved2:7;
> + uint32_t length:16;
> + //------------------------//
What does this mean?
> + uint8_t *bounce_src;
> + uint8_t *bounce_dst;
> + uint32_t bounce_length;
> +};
> +
> +#define QSPI_MAX_NUM_DESC 5
> struct qspi_xfer {
Missing blank line after the define...
> + for (ii = 0; ii < sgt->nents; ii++)
> + sg_total_len += sg_dma_len(sgt->sgl + ii);
Why are we calling the iterator ii here?
> + if (ctrl->xfer.dir == QSPI_READ)
> + byte_ptr = (uint8_t *)xfer->rx_buf;
> + else
> + byte_ptr = (uint8_t *)xfer->tx_buf;
If we need to cast to or from void * there's some sort of problem.
> +/* Switch to DMA if transfer length exceeds this */
> +#define QSPI_MAX_BYTES_FIFO 64
> +#define NO_TX_DATA_DELAY_FOR_DMA 1
> +#define DMA_CONDITIONAL (xfer->len > QSPI_MAX_BYTES_FIFO)
> +
DMA_CONDITIONAL absolutely should not be a define, this just makes
things harder to read. Just have everything call can_dma() when needed.
> +#if NO_TX_DATA_DELAY_FOR_DMA
> + mstr_cfg &= ~(TX_DATA_OE_DELAY_MSK | TX_DATA_DELAY_MSK);
> +#endif
Why would we add extra delays if we don't need them, might someone set
this and if so when?
> + if (ctrl->xfer_mode == QSPI_FIFO) {
> + } else if (ctrl->xfer_mode == QSPI_DMA) {
> }
This should be a switch statement.
On 04/04/2023 20:03, Vijaya Krishna Nivarthi wrote:
> This is done as part of adding DMA support to qspi driver.
>
> Signed-off-by: Vijaya Krishna Nivarthi <[email protected]>
> ---
> arch/arm64/boot/dts/qcom/sc7280.dtsi | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi b/arch/arm64/boot/dts/qcom/sc7280.dtsi
> index 5e6f9f4..9e05285 100644
> --- a/arch/arm64/boot/dts/qcom/sc7280.dtsi
> +++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi
> @@ -3434,6 +3434,7 @@
> qspi: spi@88dc000 {
> compatible = "qcom,sc7280-qspi", "qcom,qspi-v1";
> reg = <0 0x088dc000 0 0x1000>;
> + iommus = <&apps_smmu 0x20 0x0>
Does not look like you tested the DTS against bindings. Please run `make
dtbs_check` (see Documentation/devicetree/bindings/writing-schema.rst
for instructions).
Best regards,
Krzysztof
Quoting Vijaya Krishna Nivarthi (2023-04-04 11:03:18)
> There are large number of QSPI irqs that fire during boot/init and later
> on every suspend/resume.
> This could be made faster by doing DMA instead of PIO.
> Below is comparison for number of interrupts raised in 2 acenarios...
> Boot up and stabilise
> Suspend/Resume
>
> Sequence PIO DMA
> =======================
> Boot-up 69088 19284
> S/R 5066 3430
>
The interrupts are less, yes, but does it save time or power?
Quoting Vijaya Krishna Nivarthi (2023-04-04 11:03:20)
> For performance improvement in terms of number of interrupts.
> Code flow for DMA mode...
> Allocate for DMA pools in probe()
> For xfers with len > 64 bytes, return true from can_dma()
> Thus framework creates and maps an sg table for each xfer buffer.
> In transfer_one() enable DMA in MSTR_CONFIG.
> Split the dma buffer of each entry of sgt into a maximum of 3 parts...
> Unaligned head, aligned middle and tail.
> For each part create a dma_cmd_descriptor.
> For head and tail allocate dma_data_descriptors
> For tx, copy the data into allocated data descriptors
> For rx, remember the original buffers to copy from-
> allocated descriptors after xfer is complete
> For middle part use the aligned dma buffer.
> Link each dma_cmd_descriptor to next.
> Thus create a chain of descriptors.
> Kick-off the xfer by copying 1st cmd descriptor to-
> NEXT_DMA_DESC_ADDR register.
> On receiving DMA_CHAIN_DONE interrupt, complete the xfer and-
> free descriptors.
> If timeout happens handle error by freeing descriptors.
> In remove() free DMA pools.
>
Write a real commit text, and not psuedo-code for the patch. It should
sell us on wanting to review the patch, and help us understand the
importance of the change. The "how" comes from reading the patch
and code itself.
> Signed-off-by: Vijaya Krishna Nivarthi <[email protected]>
> ---
> drivers/spi/spi-qcom-qspi.c | 429 ++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 398 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/spi/spi-qcom-qspi.c b/drivers/spi/spi-qcom-qspi.c
> index fab1553..64c3bec 100644
> --- a/drivers/spi/spi-qcom-qspi.c
> +++ b/drivers/spi/spi-qcom-qspi.c
> @@ -13,7 +13,8 @@
> #include <linux/pm_opp.h>
> #include <linux/spi/spi.h>
> #include <linux/spi/spi-mem.h>
> -
> +#include <linux/dmapool.h>
> +#include <linux/dma-mapping.h>
Sort this alphabetically like all the other headers here.
>
> #define QSPI_NUM_CS 2
> #define QSPI_BYTES_PER_WORD 4
> @@ -62,6 +63,7 @@
> #define WR_FIFO_FULL BIT(10)
> #define WR_FIFO_OVERRUN BIT(11)
> #define TRANSACTION_DONE BIT(16)
> +#define DMA_CHAIN_DONE BIT(31)
> #define QSPI_ERR_IRQS (RESP_FIFO_UNDERRUN | HRESP_FROM_NOC_ERR | \
> WR_FIFO_OVERRUN)
> #define QSPI_ALL_IRQS (QSPI_ERR_IRQS | RESP_FIFO_RDY | \
> @@ -108,6 +110,10 @@
> #define RD_FIFO_RESET 0x0030
> #define RESET_FIFO BIT(0)
>
> +#define NEXT_DMA_DESC_ADDR 0x0040
> +#define CURRENT_DMA_DESC_ADDR 0x0044
> +#define CURRENT_MEM_ADDR 0x0048
> +
> #define CUR_MEM_ADDR 0x0048
> #define HW_VERSION 0x004c
> #define RD_FIFO 0x0050
> @@ -120,6 +126,22 @@ enum qspi_dir {
> QSPI_WRITE,
> };
>
> +struct qspi_cmd_desc {
> + uint32_t data_address;
Use u32/u8 instead of uint32_t/uint8_t
> + uint32_t next_descriptor;
> + uint32_t direction:1;
> + uint32_t multi_io_mode:3;
> + uint32_t reserved1:4;
> + uint32_t fragment:1;
> + uint32_t reserved2:7;
> + uint32_t length:16;
> + //------------------------//
> + uint8_t *bounce_src;
> + uint8_t *bounce_dst;
> + uint32_t bounce_length;
> +};
> +
> +#define QSPI_MAX_NUM_DESC 5
> struct qspi_xfer {
> union {
> const void *tx_buf;
> @@ -137,11 +159,30 @@ enum qspi_clocks {
> QSPI_NUM_CLKS
> };
>
> +enum qspi_xfer_mode {
> + QSPI_INVALID,
> + QSPI_FIFO,
> + QSPI_DMA
> +};
> +
> +/* number of entries in sgt returned from spi framework that we can support */
> +#define QSPI_QCOM_MAX_SG 5
> +
> struct qcom_qspi {
> void __iomem *base;
> struct device *dev;
> struct clk_bulk_data *clks;
> struct qspi_xfer xfer;
> + struct dma_pool *dma_cmd_pool;
> + struct dma_pool *dma_data_pool;
> + dma_addr_t dma_cmd_desc[3*QSPI_QCOM_MAX_SG];
Make a define for 3 and 2 and then a define for each equation?
> + dma_addr_t dma_data_desc[2*QSPI_QCOM_MAX_SG];
> + void *virt_cmd_desc[3*QSPI_QCOM_MAX_SG];
> + void *virt_data_desc[2*QSPI_QCOM_MAX_SG];
> + unsigned int n_cmd_desc;
> + unsigned int n_data_desc;
> + int xfer_mode;
> + u32 iomode;
Good use of u32
> struct icc_path *icc_path_cpu_to_qspi;
> unsigned long last_speed;
> /* Lock to protect data accessed by IRQs */
> @@ -258,6 +326,186 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
> return 0;
> }
>
> +/* aligned to 32 bytes, not to cross 1KB boundary */
> +#define QSPI_ALIGN_REQ 32
> +#define QSPI_BOUNDARY_REQ 1024
> +
> +int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
> + dma_addr_t dma_ptr, uint32_t n_bytes)
> +{
> + struct qspi_cmd_desc *virt_cmd_desc, *prev;
> + uint8_t *virt_data_desc;
> + dma_addr_t dma_cmd_desc, dma_data_desc;
> +
> + if (virt_ptr && n_bytes >= QSPI_ALIGN_REQ) {
> + dev_err(ctrl->dev,
> + "Exiting to avert memory overwrite, n_bytes-%d\n", n_bytes);
> + return -ENOMEM;
> + }
> +
> + /* allocate for dma cmd descriptor */
> + virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
> + GFP_KERNEL, &dma_cmd_desc);
> + if (!virt_cmd_desc) {
> + dev_err(ctrl->dev,
> + "Could not allocate for cmd_desc\n");
> + return -ENOMEM;
> + }
> + ctrl->virt_cmd_desc[ctrl->n_cmd_desc] = virt_cmd_desc;
> + ctrl->dma_cmd_desc[ctrl->n_cmd_desc] = dma_cmd_desc;
> + ctrl->n_cmd_desc++;
> +
> + /* allocate for dma data descriptor if unaligned else use pre-aligned */
> + if (virt_ptr) {
> + virt_data_desc = (uint8_t *)dma_pool_zalloc(ctrl->dma_data_pool,
> + GFP_KERNEL, &dma_data_desc);
> + if (!virt_data_desc) {
> + dev_err(ctrl->dev,
> + "Could not allocate for data_desc\n");
> + return -ENOMEM;
> + }
> + ctrl->virt_data_desc[ctrl->n_data_desc] = virt_data_desc;
> + ctrl->dma_data_desc[ctrl->n_data_desc] = dma_data_desc;
> + ctrl->n_data_desc++;
> +
> + /*
> + * for tx copy xfer data into allocated buffer
> + * for rx setup bounce info to copy after xfer
> + */
> + if (ctrl->xfer.dir == QSPI_WRITE) {
> + memcpy(virt_data_desc, virt_ptr, n_bytes);
> + } else {
> + virt_cmd_desc->bounce_src = virt_data_desc;
> + virt_cmd_desc->bounce_dst = virt_ptr;
> + virt_cmd_desc->bounce_length = n_bytes;
> + }
> + } else {
> + dma_data_desc = dma_ptr;
> + }
> +
> + /* setup cmd descriptor */
> + virt_cmd_desc->data_address = (uint32_t)(uintptr_t)(dma_data_desc);
Why does this need to be casted?
> + virt_cmd_desc->next_descriptor = 0;
> + virt_cmd_desc->direction = ctrl->xfer.dir;
> + virt_cmd_desc->multi_io_mode = ctrl->iomode;
> + virt_cmd_desc->reserved1 = 0;
> + virt_cmd_desc->fragment = 0;
> + virt_cmd_desc->reserved2 = 0;
> + virt_cmd_desc->length = n_bytes;
> +
> + /* update previous descriptor */
> + if (ctrl->n_cmd_desc >= 2) {
> + prev = (ctrl->virt_cmd_desc)[ctrl->n_cmd_desc - 2];
> + prev->next_descriptor = dma_cmd_desc;
> + prev->fragment = 1;
> + }
> +
> + return 0;
> +}
> +
> +static int qcom_qspi_setup_dma_desc(struct qcom_qspi *ctrl,
> + struct spi_transfer *xfer)
> +{
> + int ret;
> + struct sg_table *sgt;
> + unsigned int sg_total_len = 0;
> + dma_addr_t dma_ptr_sg;
> + unsigned int dma_len_sg;
> + uint32_t prolog_bytes, aligned_bytes, epilog_bytes;
> + dma_addr_t aligned_ptr;
Don't put types in the variable name.
> + int ii;
Why double i?
> + uint8_t *byte_ptr;
> +
> + if (ctrl->n_cmd_desc || ctrl->n_data_desc) {
> + dev_err(ctrl->dev, "Remnant dma buffers cmd-%d, data-%d\n",
> + ctrl->n_cmd_desc, ctrl->n_data_desc);
> + return -EIO;
> + }
> +
> + sgt = (ctrl->xfer.dir == QSPI_READ) ? &xfer->rx_sg : &xfer->tx_sg;
> + if (!sgt->nents || sgt->nents > QSPI_QCOM_MAX_SG) {
> + dev_err(ctrl->dev, "Cannot handle %d entries in scatter list\n", sgt->nents);
Hi Vijaya,
kernel test robot noticed the following build warnings:
[auto build test WARNING on broonie-spi/for-next]
[also build test WARNING on robh/for-next broonie-sound/for-next linus/master v6.3-rc5 next-20230404]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Vijaya-Krishna-Nivarthi/arm64-dts-qcom-sc7280-Add-stream-id-of-qspi-to-iommus/20230405-020430
base: https://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git for-next
patch link: https://lore.kernel.org/r/1680631400-28865-3-git-send-email-quic_vnivarth%40quicinc.com
patch subject: [PATCH 2/2] spi: spi-qcom-qspi: Add DMA mode support
config: arm64-randconfig-r001-20230403 (https://download.01.org/0day-ci/archive/20230405/[email protected]/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project 67409911353323ca5edf2049ef0df54132fa1ca7)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install arm64 cross compiling tool for clang build
# apt-get install binutils-aarch64-linux-gnu
# https://github.com/intel-lab-lkp/linux/commit/72895c4ce2b53c096fa03c9f56211df21faad6dd
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Vijaya-Krishna-Nivarthi/arm64-dts-qcom-sc7280-Add-stream-id-of-qspi-to-iommus/20230405-020430
git checkout 72895c4ce2b53c096fa03c9f56211df21faad6dd
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm64 olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=arm64 SHELL=/bin/bash drivers/spi/
If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-kbuild-all/[email protected]/
All warnings (new ones prefixed by >>):
>> drivers/spi/spi-qcom-qspi.c:333:5: warning: no previous prototype for function 'qcom_qspi_alloc_desc' [-Wmissing-prototypes]
int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
^
drivers/spi/spi-qcom-qspi.c:333:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
^
static
1 warning generated.
vim +/qcom_qspi_alloc_desc +333 drivers/spi/spi-qcom-qspi.c
332
> 333 int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
334 dma_addr_t dma_ptr, uint32_t n_bytes)
335 {
336 struct qspi_cmd_desc *virt_cmd_desc, *prev;
337 uint8_t *virt_data_desc;
338 dma_addr_t dma_cmd_desc, dma_data_desc;
339
340 if (virt_ptr && n_bytes >= QSPI_ALIGN_REQ) {
341 dev_err(ctrl->dev,
342 "Exiting to avert memory overwrite, n_bytes-%d\n", n_bytes);
343 return -ENOMEM;
344 }
345
346 /* allocate for dma cmd descriptor */
347 virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
348 GFP_KERNEL, &dma_cmd_desc);
349 if (!virt_cmd_desc) {
350 dev_err(ctrl->dev,
351 "Could not allocate for cmd_desc\n");
352 return -ENOMEM;
353 }
354 ctrl->virt_cmd_desc[ctrl->n_cmd_desc] = virt_cmd_desc;
355 ctrl->dma_cmd_desc[ctrl->n_cmd_desc] = dma_cmd_desc;
356 ctrl->n_cmd_desc++;
357
358 /* allocate for dma data descriptor if unaligned else use pre-aligned */
359 if (virt_ptr) {
360 virt_data_desc = (uint8_t *)dma_pool_zalloc(ctrl->dma_data_pool,
361 GFP_KERNEL, &dma_data_desc);
362 if (!virt_data_desc) {
363 dev_err(ctrl->dev,
364 "Could not allocate for data_desc\n");
365 return -ENOMEM;
366 }
367 ctrl->virt_data_desc[ctrl->n_data_desc] = virt_data_desc;
368 ctrl->dma_data_desc[ctrl->n_data_desc] = dma_data_desc;
369 ctrl->n_data_desc++;
370
371 /*
372 * for tx copy xfer data into allocated buffer
373 * for rx setup bounce info to copy after xfer
374 */
375 if (ctrl->xfer.dir == QSPI_WRITE) {
376 memcpy(virt_data_desc, virt_ptr, n_bytes);
377 } else {
378 virt_cmd_desc->bounce_src = virt_data_desc;
379 virt_cmd_desc->bounce_dst = virt_ptr;
380 virt_cmd_desc->bounce_length = n_bytes;
381 }
382 } else {
383 dma_data_desc = dma_ptr;
384 }
385
386 /* setup cmd descriptor */
387 virt_cmd_desc->data_address = (uint32_t)(uintptr_t)(dma_data_desc);
388 virt_cmd_desc->next_descriptor = 0;
389 virt_cmd_desc->direction = ctrl->xfer.dir;
390 virt_cmd_desc->multi_io_mode = ctrl->iomode;
391 virt_cmd_desc->reserved1 = 0;
392 virt_cmd_desc->fragment = 0;
393 virt_cmd_desc->reserved2 = 0;
394 virt_cmd_desc->length = n_bytes;
395
396 /* update previous descriptor */
397 if (ctrl->n_cmd_desc >= 2) {
398 prev = (ctrl->virt_cmd_desc)[ctrl->n_cmd_desc - 2];
399 prev->next_descriptor = dma_cmd_desc;
400 prev->fragment = 1;
401 }
402
403 return 0;
404 }
405
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests
All reviewers,
Thank you very much for your time and review...
While I am addressing other comments, below are some responses...
On 4/4/2023 11:47 PM, Mark Brown wrote:
> On Tue, Apr 04, 2023 at 11:33:20PM +0530, Vijaya Krishna Nivarthi wrote:
>
> A few quick review comments, mostly coding style though.
>
>> +struct qspi_cmd_desc {
>> + uint32_t data_address;
>> + uint32_t next_descriptor;
>> + uint32_t direction:1;
>> + uint32_t multi_io_mode:3;
>> + uint32_t reserved1:4;
>> + uint32_t fragment:1;
>> + uint32_t reserved2:7;
>> + uint32_t length:16;
>> + //------------------------//
> What does this mean?
That separates the part of cmd_desc that is visible to the HW and the
part that is required by the SW after xfer is complete.
I can add a comment in v2?
>
>> + uint8_t *bounce_src;
>> + uint8_t *bounce_dst;
>> + uint32_t bounce_length;
>> +};
>> +
>> +#define QSPI_MAX_NUM_DESC 5
>> struct qspi_xfer {
> Missing blank line after the define...
Will address in v2
>
>> + for (ii = 0; ii < sgt->nents; ii++)
>> + sg_total_len += sg_dma_len(sgt->sgl + ii);
> Why are we calling the iterator ii here?
Calling it ii helps in finding iterator more easily in code.
should I stick to i in v2?
>
>> + if (ctrl->xfer.dir == QSPI_READ)
>> + byte_ptr = (uint8_t *)xfer->rx_buf;
>> + else
>> + byte_ptr = (uint8_t *)xfer->tx_buf;
> If we need to cast to or from void * there's some sort of problem.
the tx_buf is a const void*
in v2 I will cast for tx_buf only?
>
>> +/* Switch to DMA if transfer length exceeds this */
>> +#define QSPI_MAX_BYTES_FIFO 64
>> +#define NO_TX_DATA_DELAY_FOR_DMA 1
>> +#define DMA_CONDITIONAL (xfer->len > QSPI_MAX_BYTES_FIFO)
>> +
> DMA_CONDITIONAL absolutely should not be a define, this just makes
> things harder to read. Just have everything call can_dma() when needed.
Will address in v2
>
>> +#if NO_TX_DATA_DELAY_FOR_DMA
>> + mstr_cfg &= ~(TX_DATA_OE_DELAY_MSK | TX_DATA_DELAY_MSK);
>> +#endif
> Why would we add extra delays if we don't need them, might someone set
> this and if so when?
I believe its used when some slave devices need a delay between clock
and data.
Its configured as 1 for PIO_MODE(FIFO) right now.
For DMA_MODE we are not using same, both seem to work for DMA.
>
>> + if (ctrl->xfer_mode == QSPI_FIFO) {
>> + } else if (ctrl->xfer_mode == QSPI_DMA) {
>> }
> This should be a switch statement.
Will address in v2
On Thu, Apr 06, 2023 at 08:23:21PM +0530, Vijaya Krishna Nivarthi wrote:
> On 4/4/2023 11:47 PM, Mark Brown wrote:
> > On Tue, Apr 04, 2023 at 11:33:20PM +0530, Vijaya Krishna Nivarthi wrote:
> > > + uint32_t reserved2:7;
> > > + uint32_t length:16;
> > > + //------------------------//
> > What does this mean?
> That separates the part of cmd_desc that is visible to the HW and the part
> that is required by the SW after xfer is complete.
> I can add a comment in v2?
Yes, please.
> > > + for (ii = 0; ii < sgt->nents; ii++)
> > > + sg_total_len += sg_dma_len(sgt->sgl + ii);
> > Why are we calling the iterator ii here?
> Calling it ii helps in finding iterator more easily in code.
> should I stick to i in v2?
Given that multiple people queried this...
> > > + if (ctrl->xfer.dir == QSPI_READ)
> > > + byte_ptr = (uint8_t *)xfer->rx_buf;
> > > + else
> > > + byte_ptr = (uint8_t *)xfer->tx_buf;
> > If we need to cast to or from void * there's some sort of problem.
> the tx_buf is a const void*
> in v2 I will cast for tx_buf only?
Or just keep byte_ptr as const - we're not modifying it are we?
> > > +#if NO_TX_DATA_DELAY_FOR_DMA
> > > + mstr_cfg &= ~(TX_DATA_OE_DELAY_MSK | TX_DATA_DELAY_MSK);
> > > +#endif
> > Why would we add extra delays if we don't need them, might someone set
> > this and if so when?
> I believe its used when some slave devices need a delay between clock and
> data.
> Its configured as 1 for PIO_MODE(FIFO) right now.
> For DMA_MODE we are not using same, both seem to work for DMA.
If some devices need this to be configured it needs to be configured
either from the driver for that device or via DT depending on the exact
requirements.
Thank you for the review...
On 4/5/2023 2:00 AM, Stephen Boyd wrote:
> Quoting Vijaya Krishna Nivarthi (2023-04-04 11:03:18)
>> There are large number of QSPI irqs that fire during boot/init and later
>> on every suspend/resume.
>> This could be made faster by doing DMA instead of PIO.
>> Below is comparison for number of interrupts raised in 2 acenarios...
>> Boot up and stabilise
>> Suspend/Resume
>>
>> Sequence PIO DMA
>> =======================
>> Boot-up 69088 19284
>> S/R 5066 3430
>>
> The interrupts are less, yes, but does it save time or power?
We have not made measurements but the change is expected to improve
performance.
Testing revealed no regressions.
Thank you for the responses...
On 4/6/2023 8:58 PM, Mark Brown wrote:
> On Thu, Apr 06, 2023 at 08:23:21PM +0530, Vijaya Krishna Nivarthi wrote:
>> On 4/4/2023 11:47 PM, Mark Brown wrote:
>>> On Tue, Apr 04, 2023 at 11:33:20PM +0530, Vijaya Krishna Nivarthi wrote:
>>>> + uint32_t reserved2:7;
>>>> + uint32_t length:16;
>>>> + //------------------------//
>>> What does this mean?
>> That separates the part of cmd_desc that is visible to the HW and the part
>> that is required by the SW after xfer is complete.
>> I can add a comment in v2?
> Yes, please.
Added comments
>
>>>> + for (ii = 0; ii < sgt->nents; ii++)
>>>> + sg_total_len += sg_dma_len(sgt->sgl + ii);
>>> Why are we calling the iterator ii here?
>> Calling it ii helps in finding iterator more easily in code.
>> should I stick to i in v2?
> Given that multiple people queried this...
Renamed.
>
>>>> + if (ctrl->xfer.dir == QSPI_READ)
>>>> + byte_ptr = (uint8_t *)xfer->rx_buf;
>>>> + else
>>>> + byte_ptr = (uint8_t *)xfer->tx_buf;
>>> If we need to cast to or from void * there's some sort of problem.
>> the tx_buf is a const void*
>> in v2 I will cast for tx_buf only?
> Or just keep byte_ptr as const - we're not modifying it are we?
We are modifying it, hence did cast for tx_buf only
>
>>>> +#if NO_TX_DATA_DELAY_FOR_DMA
>>>> + mstr_cfg &= ~(TX_DATA_OE_DELAY_MSK | TX_DATA_DELAY_MSK);
>>>> +#endif
>>> Why would we add extra delays if we don't need them, might someone set
>>> this and if so when?
>> I believe its used when some slave devices need a delay between clock and
>> data.
>> Its configured as 1 for PIO_MODE(FIFO) right now.
>> For DMA_MODE we are not using same, both seem to work for DMA.
> If some devices need this to be configured it needs to be configured
> either from the driver for that device or via DT depending on the exact
> requirements.
Retained same MSTR_CONFIG as PIO mode, hence not touching any of the DELAYs
Thank you for the review...
On 4/5/2023 2:15 AM, Stephen Boyd wrote:
> Quoting Vijaya Krishna Nivarthi (2023-04-04 11:03:20)
>> For performance improvement in terms of number of interrupts.
>> Code flow for DMA mode...
>> Allocate for DMA pools in probe()
>> For xfers with len > 64 bytes, return true from can_dma()
>> Thus framework creates and maps an sg table for each xfer buffer.
>> In transfer_one() enable DMA in MSTR_CONFIG.
>> Split the dma buffer of each entry of sgt into a maximum of 3 parts...
>> Unaligned head, aligned middle and tail.
>> For each part create a dma_cmd_descriptor.
>> For head and tail allocate dma_data_descriptors
>> For tx, copy the data into allocated data descriptors
>> For rx, remember the original buffers to copy from-
>> allocated descriptors after xfer is complete
>> For middle part use the aligned dma buffer.
>> Link each dma_cmd_descriptor to next.
>> Thus create a chain of descriptors.
>> Kick-off the xfer by copying 1st cmd descriptor to-
>> NEXT_DMA_DESC_ADDR register.
>> On receiving DMA_CHAIN_DONE interrupt, complete the xfer and-
>> free descriptors.
>> If timeout happens handle error by freeing descriptors.
>> In remove() free DMA pools.
>>
> Write a real commit text, and not psuedo-code for the patch. It should
> sell us on wanting to review the patch, and help us understand the
> importance of the change. The "how" comes from reading the patch
> and code itself.
Modified.
>> Signed-off-by: Vijaya Krishna Nivarthi <[email protected]>
>> ---
>> drivers/spi/spi-qcom-qspi.c | 429 ++++++++++++++++++++++++++++++++++++++++----
>> 1 file changed, 398 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/spi/spi-qcom-qspi.c b/drivers/spi/spi-qcom-qspi.c
>> index fab1553..64c3bec 100644
>> --- a/drivers/spi/spi-qcom-qspi.c
>> +++ b/drivers/spi/spi-qcom-qspi.c
>> @@ -13,7 +13,8 @@
>> #include <linux/pm_opp.h>
>> #include <linux/spi/spi.h>
>> #include <linux/spi/spi-mem.h>
>> -
>> +#include <linux/dmapool.h>
>> +#include <linux/dma-mapping.h>
> Sort this alphabetically like all the other headers here.
Done
>> #define QSPI_NUM_CS 2
>> #define QSPI_BYTES_PER_WORD 4
>> @@ -62,6 +63,7 @@
>> #define WR_FIFO_FULL BIT(10)
>> #define WR_FIFO_OVERRUN BIT(11)
>> #define TRANSACTION_DONE BIT(16)
>> +#define DMA_CHAIN_DONE BIT(31)
>> #define QSPI_ERR_IRQS (RESP_FIFO_UNDERRUN | HRESP_FROM_NOC_ERR | \
>> WR_FIFO_OVERRUN)
>> #define QSPI_ALL_IRQS (QSPI_ERR_IRQS | RESP_FIFO_RDY | \
>> @@ -108,6 +110,10 @@
>> #define RD_FIFO_RESET 0x0030
>> #define RESET_FIFO BIT(0)
>>
>> +#define NEXT_DMA_DESC_ADDR 0x0040
>> +#define CURRENT_DMA_DESC_ADDR 0x0044
>> +#define CURRENT_MEM_ADDR 0x0048
>> +
>> #define CUR_MEM_ADDR 0x0048
>> #define HW_VERSION 0x004c
>> #define RD_FIFO 0x0050
>> @@ -120,6 +126,22 @@ enum qspi_dir {
>> QSPI_WRITE,
>> };
>>
>> +struct qspi_cmd_desc {
>> + uint32_t data_address;
> Use u32/u8 instead of uint32_t/uint8_t
Done
>> + uint32_t next_descriptor;
>> + uint32_t direction:1;
>> + uint32_t multi_io_mode:3;
>> + uint32_t reserved1:4;
>> + uint32_t fragment:1;
>> + uint32_t reserved2:7;
>> + uint32_t length:16;
>> + //------------------------//
>> + uint8_t *bounce_src;
>> + uint8_t *bounce_dst;
>> + uint32_t bounce_length;
>> +};
>> +
>> +#define QSPI_MAX_NUM_DESC 5
>> struct qspi_xfer {
>> union {
>> const void *tx_buf;
>> @@ -137,11 +159,30 @@ enum qspi_clocks {
>> QSPI_NUM_CLKS
>> };
>>
>> +enum qspi_xfer_mode {
>> + QSPI_INVALID,
>> + QSPI_FIFO,
>> + QSPI_DMA
>> +};
>> +
>> +/* number of entries in sgt returned from spi framework that we can support */
>> +#define QSPI_QCOM_MAX_SG 5
>> +
>> struct qcom_qspi {
>> void __iomem *base;
>> struct device *dev;
>> struct clk_bulk_data *clks;
>> struct qspi_xfer xfer;
>> + struct dma_pool *dma_cmd_pool;
>> + struct dma_pool *dma_data_pool;
>> + dma_addr_t dma_cmd_desc[3*QSPI_QCOM_MAX_SG];
> Make a define for 3 and 2 and then a define for each equation?
Made defines for 3 and 2
>> + dma_addr_t dma_data_desc[2*QSPI_QCOM_MAX_SG];
>> + void *virt_cmd_desc[3*QSPI_QCOM_MAX_SG];
>> + void *virt_data_desc[2*QSPI_QCOM_MAX_SG];
>> + unsigned int n_cmd_desc;
>> + unsigned int n_data_desc;
>> + int xfer_mode;
>> + u32 iomode;
> Good use of u32
Thank you
>
>> struct icc_path *icc_path_cpu_to_qspi;
>> unsigned long last_speed;
>> /* Lock to protect data accessed by IRQs */
>> @@ -258,6 +326,186 @@ static int qcom_qspi_set_speed(struct qcom_qspi *ctrl, unsigned long speed_hz)
>> return 0;
>> }
>>
>> +/* aligned to 32 bytes, not to cross 1KB boundary */
>> +#define QSPI_ALIGN_REQ 32
>> +#define QSPI_BOUNDARY_REQ 1024
>> +
>> +int qcom_qspi_alloc_desc(struct qcom_qspi *ctrl, uint8_t *virt_ptr,
>> + dma_addr_t dma_ptr, uint32_t n_bytes)
>> +{
>> + struct qspi_cmd_desc *virt_cmd_desc, *prev;
>> + uint8_t *virt_data_desc;
>> + dma_addr_t dma_cmd_desc, dma_data_desc;
>> +
>> + if (virt_ptr && n_bytes >= QSPI_ALIGN_REQ) {
>> + dev_err(ctrl->dev,
>> + "Exiting to avert memory overwrite, n_bytes-%d\n", n_bytes);
>> + return -ENOMEM;
>> + }
>> +
>> + /* allocate for dma cmd descriptor */
>> + virt_cmd_desc = (struct qspi_cmd_desc *)dma_pool_alloc(ctrl->dma_cmd_pool,
>> + GFP_KERNEL, &dma_cmd_desc);
>> + if (!virt_cmd_desc) {
>> + dev_err(ctrl->dev,
>> + "Could not allocate for cmd_desc\n");
>> + return -ENOMEM;
>> + }
>> + ctrl->virt_cmd_desc[ctrl->n_cmd_desc] = virt_cmd_desc;
>> + ctrl->dma_cmd_desc[ctrl->n_cmd_desc] = dma_cmd_desc;
>> + ctrl->n_cmd_desc++;
>> +
>> + /* allocate for dma data descriptor if unaligned else use pre-aligned */
>> + if (virt_ptr) {
>> + virt_data_desc = (uint8_t *)dma_pool_zalloc(ctrl->dma_data_pool,
>> + GFP_KERNEL, &dma_data_desc);
>> + if (!virt_data_desc) {
>> + dev_err(ctrl->dev,
>> + "Could not allocate for data_desc\n");
>> + return -ENOMEM;
>> + }
>> + ctrl->virt_data_desc[ctrl->n_data_desc] = virt_data_desc;
>> + ctrl->dma_data_desc[ctrl->n_data_desc] = dma_data_desc;
>> + ctrl->n_data_desc++;
>> +
>> + /*
>> + * for tx copy xfer data into allocated buffer
>> + * for rx setup bounce info to copy after xfer
>> + */
>> + if (ctrl->xfer.dir == QSPI_WRITE) {
>> + memcpy(virt_data_desc, virt_ptr, n_bytes);
>> + } else {
>> + virt_cmd_desc->bounce_src = virt_data_desc;
>> + virt_cmd_desc->bounce_dst = virt_ptr;
>> + virt_cmd_desc->bounce_length = n_bytes;
>> + }
>> + } else {
>> + dma_data_desc = dma_ptr;
>> + }
>> +
>> + /* setup cmd descriptor */
>> + virt_cmd_desc->data_address = (uint32_t)(uintptr_t)(dma_data_desc);
> Why does this need to be casted?
Agreed not required, removed.
>> + virt_cmd_desc->next_descriptor = 0;
>> + virt_cmd_desc->direction = ctrl->xfer.dir;
>> + virt_cmd_desc->multi_io_mode = ctrl->iomode;
>> + virt_cmd_desc->reserved1 = 0;
>> + virt_cmd_desc->fragment = 0;
>> + virt_cmd_desc->reserved2 = 0;
>> + virt_cmd_desc->length = n_bytes;
>> +
>> + /* update previous descriptor */
>> + if (ctrl->n_cmd_desc >= 2) {
>> + prev = (ctrl->virt_cmd_desc)[ctrl->n_cmd_desc - 2];
>> + prev->next_descriptor = dma_cmd_desc;
>> + prev->fragment = 1;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static int qcom_qspi_setup_dma_desc(struct qcom_qspi *ctrl,
>> + struct spi_transfer *xfer)
>> +{
>> + int ret;
>> + struct sg_table *sgt;
>> + unsigned int sg_total_len = 0;
>> + dma_addr_t dma_ptr_sg;
>> + unsigned int dma_len_sg;
>> + uint32_t prolog_bytes, aligned_bytes, epilog_bytes;
>> + dma_addr_t aligned_ptr;
> Don't put types in the variable name.
Renamed
>
>> + int ii;
> Why double i?
Renamed
>
>> + uint8_t *byte_ptr;
>> +
>> + if (ctrl->n_cmd_desc || ctrl->n_data_desc) {
>> + dev_err(ctrl->dev, "Remnant dma buffers cmd-%d, data-%d\n",
>> + ctrl->n_cmd_desc, ctrl->n_data_desc);
>> + return -EIO;
>> + }
>> +
>> + sgt = (ctrl->xfer.dir == QSPI_READ) ? &xfer->rx_sg : &xfer->tx_sg;
>> + if (!sgt->nents || sgt->nents > QSPI_QCOM_MAX_SG) {
>> + dev_err(ctrl->dev, "Cannot handle %d entries in scatter list\n", sgt->nents);
Thank you very much for the review...
On 4/4/2023 11:49 PM, Krzysztof Kozlowski wrote:
> On 04/04/2023 20:03, Vijaya Krishna Nivarthi wrote:
>> This is done as part of adding DMA support to qspi driver.
>>
>> Signed-off-by: Vijaya Krishna Nivarthi <[email protected]>
>> ---
>> arch/arm64/boot/dts/qcom/sc7280.dtsi | 1 +
>> 1 file changed, 1 insertion(+)
>>
>> diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi b/arch/arm64/boot/dts/qcom/sc7280.dtsi
>> index 5e6f9f4..9e05285 100644
>> --- a/arch/arm64/boot/dts/qcom/sc7280.dtsi
>> +++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi
>> @@ -3434,6 +3434,7 @@
>> qspi: spi@88dc000 {
>> compatible = "qcom,sc7280-qspi", "qcom,qspi-v1";
>> reg = <0 0x088dc000 0 0x1000>;
>> + iommus = <&apps_smmu 0x20 0x0>
> Does not look like you tested the DTS against bindings. Please run `make
> dtbs_check` (see Documentation/devicetree/bindings/writing-schema.rst
> for instructions).
Added changes to yaml file, ran the check and ensured it builds fine.
> Best regards,
> Krzysztof
>
On Wed, Apr 12, 2023 at 08:59:06PM +0530, Vijaya Krishna Nivarthi wrote:
> On 4/6/2023 8:58 PM, Mark Brown wrote:
> > > > > + if (ctrl->xfer.dir == QSPI_READ)
> > > > > + byte_ptr = (uint8_t *)xfer->rx_buf;
> > > > > + else
> > > > > + byte_ptr = (uint8_t *)xfer->tx_buf;
> > > > If we need to cast to or from void * there's some sort of problem.
> > > the tx_buf is a const void*
> > > in v2 I will cast for tx_buf only?
> > Or just keep byte_ptr as const - we're not modifying it are we?
> We are modifying it, hence did cast for tx_buf only
If it's being modified won't that upset the callers that thought it was
const and didn't expect the data to change?
On 4/12/2023 9:42 PM, Mark Brown wrote:
> On Wed, Apr 12, 2023 at 08:59:06PM +0530, Vijaya Krishna Nivarthi wrote:
>> On 4/6/2023 8:58 PM, Mark Brown wrote:
>>>>>> + if (ctrl->xfer.dir == QSPI_READ)
>>>>>> + byte_ptr = (uint8_t *)xfer->rx_buf;
>>>>>> + else
>>>>>> + byte_ptr = (uint8_t *)xfer->tx_buf;
>>>>> If we need to cast to or from void * there's some sort of problem.
>>>> the tx_buf is a const void*
>>>> in v2 I will cast for tx_buf only?
>>> Or just keep byte_ptr as const - we're not modifying it are we?
>> We are modifying it, hence did cast for tx_buf only
> If it's being modified won't that upset the callers that thought it was
> const and didn't expect the data to change?
I believe callers wouldn't be upset.
The byte_ptr is being modified (incremented)
It is initialised to tx_buf (a const*) and keeps getting incremented to
parse data.
No data change.
That should be ok?