2012-08-22 07:35:26

by Shi Xuelin-B29237

[permalink] [raw]
Subject: [PATCH] RAID/DMA/caamxor: support XOR offload by CAAM

From: Xuelin Shi <[email protected]>

Add XOR offloading functionality by CAAM and interface with async_tx layer
so that it can be used for RAID parity computation.

Signed-off-by: Naveen Burmi <[email protected]>
Signed-off-by: Yuan Kang <[email protected]>
Signed-off-by: Xuelin Shi <[email protected]>
---
drivers/crypto/caam/Kconfig | 15 +
drivers/crypto/caam/Makefile | 1 +
drivers/crypto/caam/caamxor.c | 880 +++++++++++++++++++++++++++++++++++++
drivers/crypto/caam/desc_constr.h | 53 +++-
drivers/crypto/caam/intern.h | 7 +
drivers/crypto/caam/jr.c | 8 +-
6 files changed, 959 insertions(+), 5 deletions(-)
create mode 100644 drivers/crypto/caam/caamxor.c

diff --git a/drivers/crypto/caam/Kconfig b/drivers/crypto/caam/Kconfig
index 65c7668..643ca0a 100644
--- a/drivers/crypto/caam/Kconfig
+++ b/drivers/crypto/caam/Kconfig
@@ -98,3 +98,18 @@ config CRYPTO_DEV_FSL_CAAM_RNG_API

To compile this as a module, choose M here: the module
will be called caamrng.
+
+config CRYPTO_DEV_FSL_CAAM_DMAXOR_API
+ tristate "Freescale CAAM XOR support"
+ depends on CRYPTO_DEV_FSL_CAAM && EXPERIMENTAL
+ default n
+ select DMA_ENGINE
+ select ASYNC_XOR
+ help
+ Selecting this will offload the xor-parity-calculation for
+ users of the Asynchronous Transfers/Transforms API (such as
+ md-raid5 driver) to the SEC4.
+
+
+ To compile this as a module, choose M here: the module
+ will be called caamxor.
diff --git a/drivers/crypto/caam/Makefile b/drivers/crypto/caam/Makefile
index b1eb448..457192c 100644
--- a/drivers/crypto/caam/Makefile
+++ b/drivers/crypto/caam/Makefile
@@ -6,5 +6,6 @@ obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM) += caam.o
obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM_CRYPTO_API) += caamalg.o
obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM_AHASH_API) += caamhash.o
obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM_RNG_API) += caamrng.o
+obj-$(CONFIG_CRYPTO_DEV_FSL_CAAM_DMAXOR_API) += caamxor.o

caam-objs := ctrl.o jr.o error.o key_gen.o
diff --git a/drivers/crypto/caam/caamxor.c b/drivers/crypto/caam/caamxor.c
new file mode 100644
index 0000000..f060cff
--- /dev/null
+++ b/drivers/crypto/caam/caamxor.c
@@ -0,0 +1,880 @@
+/*
+ * caam - Freescale Integrated Security Engine (SEC) device driver
+ * Support for off-loading XOR Parity Calculations to CAAM.
+ *
+ * Copyright 2011 Freescale Semiconductor, Inc
+ *
+ * relationship between job descriptors, shared descriptors and sources:
+ * ------------------------------ -------------------
+ * | ShareDesc |<------\ | JobDesc |
+ * | Load src pointers to ctx | \--| ShareDesc ptr |
+ * | new src jump dst: |<-----\ | SEQ_OUT_PTR |
+ * | Load ith src | | | (output buffer) |
+ * | new src mv dst: | | | (output length) |
+ * | (ith src commands) | | | SEQ_IN_PTR |
+ * | load: |<---\ | | (src commands) |----\
+ * | Seq load chunk | | | ------------------- |
+ * | return: |<---|-|-\ |
+ * | XOR quarter chunk | | | | |
+ * | Pass complete? |----^-^---\ |
+ * | Half chunk left? |----^-+ | | |
+ * | Default |----^-^-+ | |
+ * | store: |<---|-|-|-/ |
+ * | Seq store chunk | | | | ------------------- |
+ * | No data left to write? |X | | | | first src ptr |<-/
+ * | Put src1 chunk in result | | | | | first src len |
+ * | Default |----^-+ | /-| shared hdr jump |
+ * | first: |<---|-|-|-/ | nop (if needed) |
+ * | No data left to read? |----^-^-+ -------------------
+ * | Seq load chunk | | | | | ith src ptr |
+ * | Load src2 | | | | | ith src len |
+ * | Not first pass? |----^-^-/ | load src i + 1 |
+ * | first pass: | | | | nop (if needed) |
+ * | Put src1 chunk in result | | | -------------------
+ * | set output size | | | | last src ptr |
+ * | Default |----^-/ | last src len |
+ * | last: |<---|--------| shared hdr jump |
+ * | Update index | | | nop (if needed) |
+ * | Load src1 | | -------------------
+ * | Default |----/
+ * ------------------------------
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/dmaengine.h>
+
+#include "compat.h"
+#include "regs.h"
+#include "jr.h"
+#include "error.h"
+#include "intern.h"
+#include "desc.h"
+#include "desc_constr.h"
+
+/* Upper bound on the number of descriptors kept in each channel's free pool */
+#define MAX_INITIAL_DESCS 64
+/* Largest number of sources accepted for one XOR request */
+#define MAX_XOR_SRCS 8
+
+/* Job descriptor size: four command words plus three pointers */
+#define JOB_DESC_BYTES (4 * CAAM_CMD_SZ + 3 * CAAM_PTR_SZ)
+#define JOB_DESC_LEN (JOB_DESC_BYTES / CAAM_CMD_SZ)
+/* Words in the per-request source command buffer (MAX_XOR_SRCS * SRC_CMD_LEN) */
+#define CMD_DESC_LEN 32
+
+/* Non-zero when a bus pointer is wider than one command word */
+#define LONG_PTR (CAAM_PTR_SZ > CAAM_CMD_SZ)
+
+/* Source command groups held in the class 1 context; the rest use class 2 */
+#define CTX1_SLOTS 4
+/* Each source is described by a group of four command words */
+#define SRC_CMD_BYTES (4 * CAAM_CMD_SZ)
+#define SRC_CMD_LEN (SRC_CMD_BYTES / CAAM_CMD_SZ)
+/* Data is processed in chunks; _H and _Q are half and quarter of a chunk */
+#define CHUNK_SIZE 128
+#define CHUNK_SIZE_H 64
+#define CHUNK_SIZE_Q 32
+/* Bytes handled by one math-register XOR */
+#define REG_SIZE 8
+
+/*
+ * Extra word added to every label offset and to the shared descriptor buffer.
+ * NOTE(review): the hardware reason for this word is not visible here.
+ */
+#define CMD_MOVE_OVERFLOW_LEN 1
+
+/* Label positions inside the shared descriptor, in bytes and in words */
+#define LABEL_SRC_JMP_BYTES (5 * CAAM_CMD_SZ)
+#define LABEL_SRC_JMP (LABEL_SRC_JMP_BYTES / CAAM_CMD_SZ)
+#define LABEL_SRC_MV_BYTES (CAAM_CMD_SZ + LABEL_SRC_JMP_BYTES)
+#define LABEL_SRC_MV (LABEL_SRC_MV_BYTES / CAAM_CMD_SZ)
+#define LABEL_FIRST_BYTES (28 * CAAM_CMD_SZ + LABEL_SRC_MV_BYTES)
+#define LABEL_FIRST (LABEL_FIRST_BYTES / CAAM_CMD_SZ)
+#define LABEL_LAST_BYTES (13 * CAAM_CMD_SZ + LABEL_FIRST_BYTES)
+#define LABEL_LAST (LABEL_LAST_BYTES / CAAM_CMD_SZ)
+#define SH_DESC_BYTES (5 * CAAM_CMD_SZ + LABEL_LAST_BYTES)
+#define SH_DESC_LEN (SH_DESC_BYTES / CAAM_CMD_SZ)
+
+#ifdef DEBUG
+/* for print_hex_dumps with line references */
+#define xstr(s) str(s)
+#define str(s) (#s)
+#define debug(format, arg...) printk(format, arg)
+#else
+#define debug(format, arg...)
+#endif
+
+/*
+ * caam_xor_sh_desc - shared descriptor used by every XOR job
+ * @desc: descriptor words, filled in by prepare_caam_xor_sh_desc()
+ * @sh_desc_phys: bus address of @desc as returned by dma_map_single()
+ */
+struct caam_xor_sh_desc {
+ u32 desc[SH_DESC_LEN + CMD_MOVE_OVERFLOW_LEN];
+ dma_addr_t sh_desc_phys;
+};
+
+/*
+ * caam_dma_async_tx_desc - software state for one XOR request
+ * @async_tx: descriptor handed to the async_tx/dmaengine client
+ * @node: links the request into the free pool, submit_q or done_not_acked
+ * @dma_jr: owning channel, set when the request is issued
+ * @job_desc: job descriptor passed to caam_jr_enqueue()
+ * @cmd_desc: per-source command groups read by the shared descriptor
+ * @cmd_desc_phys: bus address of @cmd_desc
+ * @dest: destination bus address, kept for unmapping on completion
+ * @src: source bus addresses, kept for unmapping on completion
+ * @src_cnt: number of valid entries in @src
+ * @dma_len: length of the transfer in bytes
+ */
+struct caam_dma_async_tx_desc {
+ struct dma_async_tx_descriptor async_tx;
+ struct list_head node;
+ struct caam_dma_jr *dma_jr;
+ u32 job_desc[JOB_DESC_LEN];
+ u32 cmd_desc[CMD_DESC_LEN];
+ dma_addr_t cmd_desc_phys;
+ dma_addr_t dest;
+ dma_addr_t src[MAX_XOR_SRCS];
+ u32 src_cnt;
+ u32 dma_len;
+};
+
+/*
+ * caam_dma_desc_pool - free list of reusable request descriptors
+ * @desc_cnt: number of entries currently on @head
+ * @head: list of free caam_dma_async_tx_desc structures
+ */
+struct caam_dma_desc_pool {
+ int desc_cnt;
+ struct list_head head;
+};
+
+/*
+ * caam_dma_jr - job ring/channel data
+ * @completed_cookie: cookie of the latest completed job
+ * @chan: dma channel used by async_tx API
+ * @dev: job ring device handed to caam_jr_enqueue()
+ * @desc_lock: lock on submit_q and on the channel/completed cookies
+ * @submit_q: queue of pending (submitted, but not enqueued) jobs
+ * @done_lock: lock on done_not_acked
+ * @done_not_acked: jobs that have been completed by jr, but maybe not acked
+ * @handle_done: tasklet for cleaning done_not_acked
+ * @caam_hw_jr: jr device data
+ * @pool_lock: lock on soft_desc
+ * @soft_desc: pool of pre-allocated caam_dma_async_tx_desc structures
+ */
+struct caam_dma_jr {
+ dma_cookie_t completed_cookie;
+ struct dma_chan chan;
+ struct device *dev;
+ spinlock_t desc_lock;
+ struct list_head submit_q;
+ spinlock_t done_lock;
+ struct list_head done_not_acked;
+ struct tasklet_struct handle_done;
+ struct caam_drv_private_jr *caam_hw_jr;
+ spinlock_t pool_lock;
+ struct caam_dma_desc_pool *soft_desc;
+};
+
+/*
+ * Build the option word of a MOVE command that copies one source command
+ * group (SRC_CMD_BYTES) from context register @ctx into the descriptor
+ * buffer. @offset goes into the AUX field and @target, shifted left by two
+ * extra bits, into the offset field.
+ */
+static inline u32 load_source(u32 ctx, u32 offset, u32 target)
+{
+	u32 options = ctx | MOVE_DEST_DESCBUF | SRC_CMD_BYTES;
+
+	options |= offset << MOVE_AUX_SHIFT;
+	options |= target << (2 + MOVE_OFFSET_SHIFT);
+
+	return options;
+}
+
+/*
+ * Write a "load source" MOVE command at @desc and return the address of the
+ * word that follows it.
+ */
+static inline u32 *write_load_source(u32 *desc, u32 ctx, u32 offset, u32 target)
+{
+	u32 move_options = load_source(ctx, offset, target);
+
+	return write_move(desc, move_options);
+}
+
+/*
+ * Generate the source command list and the job descriptor for one request.
+ *
+ * Stores dest/src/src_cnt/len in @desc (caam_dma_xor_done() reads them back
+ * when unmapping), writes one command group per source into desc->cmd_desc,
+ * DMA-maps that buffer and builds desc->job_desc so that it points at the
+ * shared descriptor located at @sh_desc_phys.
+ *
+ * NOTE(review): src_cnt is a u32, so "src_cnt - 1" in the loop conditions
+ * wraps when src_cnt is 0; callers presumably guarantee src_cnt >= 1 -- confirm.
+ * NOTE(review): the address returned by dma_map_single() is not checked with
+ * dma_mapping_error().
+ */
+static void prepare_caam_xor_desc(struct device *dev,
+ struct caam_dma_async_tx_desc *desc,
+ dma_addr_t sh_desc_phys,
+ dma_addr_t dest, dma_addr_t *src,
+ u32 src_cnt, size_t len)
+{
+ u32 label_src_mv = LABEL_SRC_MV + CMD_MOVE_OVERFLOW_LEN;
+ u32 label_first = LABEL_FIRST + CMD_MOVE_OVERFLOW_LEN;
+ u32 label_last = LABEL_LAST + CMD_MOVE_OVERFLOW_LEN;
+ u32 sh_desc_len = SH_DESC_LEN + CMD_MOVE_OVERFLOW_LEN;
+ int i;
+ u32 *job_descptr = desc->job_desc;
+ u32 *cmd_desc = desc->cmd_desc;
+
+ /* remember the request parameters for the completion handler */
+ desc->dest = dest;
+ memcpy(desc->src, src, src_cnt*sizeof(dma_addr_t));
+ desc->src_cnt = src_cnt;
+ desc->dma_len = len;
+
+ /* first source: jump to special commands */
+ cmd_desc = write_ptr(cmd_desc, src[0]);
+ cmd_desc = write_cmd(cmd_desc, len);
+ init_sh_desc(cmd_desc, (label_first & HDR_START_IDX_MASK) <<
+ HDR_START_IDX_SHIFT);
+ cmd_desc++;
+ /* a one-word pointer leaves the group one word short; fill it with a nop */
+ if (!LONG_PTR)
+ cmd_desc = write_nop(cmd_desc, 1);
+
+ i = 1;
+ /* sources that load next source from first context */
+ while (i < src_cnt - 1 && i < CTX1_SLOTS - 1) {
+ cmd_desc = write_ptr(cmd_desc, src[i]);
+ cmd_desc = write_cmd(cmd_desc, len);
+ cmd_desc = write_load_source(cmd_desc, MOVE_SRC_CLASS1CTX, i +
+ 1, label_src_mv);
+ if (!LONG_PTR)
+ cmd_desc = write_nop(cmd_desc, 1);
+ i++;
+ }
+ /* sources that load next source from second context */
+ while (i < src_cnt - 1) {
+ cmd_desc = write_ptr(cmd_desc, src[i]);
+ cmd_desc = write_cmd(cmd_desc, len);
+ cmd_desc = write_load_source(cmd_desc, MOVE_SRC_CLASS2CTX, i +
+ 1, label_src_mv);
+ if (!LONG_PTR)
+ cmd_desc = write_nop(cmd_desc, 1);
+ i++;
+ }
+
+ /* last source: jump to special commands */
+ cmd_desc = write_ptr(cmd_desc, src[i]);
+ cmd_desc = write_cmd(cmd_desc, len);
+ init_sh_desc(cmd_desc, (label_last & HDR_START_IDX_MASK) <<
+ HDR_START_IDX_SHIFT);
+ cmd_desc++;
+ if (!LONG_PTR)
+ cmd_desc = write_nop(cmd_desc, 1);
+
+ /* the whole command buffer is mapped, not only the groups written above */
+ desc->cmd_desc_phys = dma_map_single(dev, desc->cmd_desc,
+ CMD_DESC_LEN * sizeof(u32),
+ DMA_TO_DEVICE);
+ init_job_desc_shared(job_descptr, sh_desc_phys, sh_desc_len,
+ HDR_SHARE_WAIT | HDR_REVERSE);
+
+ /* output sequence is the destination, input sequence is the command list */
+ append_seq_out_ptr(job_descptr, dest, len, 0);
+ append_seq_in_ptr_intlen(job_descptr, desc->cmd_desc_phys,
+ MAX_XOR_SRCS * SRC_CMD_BYTES, 0);
+
+#ifdef DEBUG
+ print_hex_dump(KERN_ERR, "job desc @"xstr(__LINE__)": ",
+ DUMP_PREFIX_ADDRESS, 16, 4, job_descptr, CAAM_CMD_SZ *
+ desc_len(job_descptr), 1);
+ print_hex_dump(KERN_ERR, "srcs @"xstr(__LINE__)": ",
+ DUMP_PREFIX_ADDRESS, 16, 4, src, src_cnt * CAAM_PTR_SZ,
+ 1);
+ /*
+ * NOTE(review): the prefix string below looks mangled (e-mail
+ * obfuscation by the archive?); confirm the intended label.
+ */
+ print_hex_dump(KERN_ERR, "src [email protected]"xstr(__LINE__)": ",
+ DUMP_PREFIX_ADDRESS, 16, 4, desc->cmd_desc,
+ SRC_CMD_BYTES * src_cnt, 1);
+#endif
+}
+
+/*
+ * Generate the shared descriptor that performs the XOR; it is built once per
+ * device and referenced by every job descriptor.
+ *
+ * @descptr: buffer that receives the descriptor words
+ * @src_cnt: number of source command groups the descriptor loads into the
+ * context registers
+ *
+ * The statement order defines the instruction layout: the LABEL_* offsets
+ * used by prepare_caam_xor_desc() depend on it, so do not reorder the
+ * append_*() calls. The flow of the program is drawn in the diagram at the
+ * top of this file.
+ */
+static void prepare_caam_xor_sh_desc(u32 *descptr, u32 src_cnt)
+{
+ bool overflow;
+ u32 label_src_jmp, label_src_mv;
+ u32 *store_jump_cmd;
+ u32 label_load, label_return, label_store;
+
+ /* more sources than fit into the class 1 context? */
+ overflow = src_cnt > CTX1_SLOTS;
+ label_src_jmp = LABEL_SRC_JMP + CMD_MOVE_OVERFLOW_LEN;
+ label_src_mv = label_src_jmp + 1;
+ init_sh_desc(descptr, HDR_SHARE_SERIAL);
+ /* Store up to 4 sources in ctx1 */
+ append_cmd(descptr, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
+ LDST_CLASS_1_CCB | (overflow ?
+ (CTX1_SLOTS * SRC_CMD_BYTES) : (src_cnt * SRC_CMD_BYTES)));
+
+ /* Store any overflow in ctx2 */
+ /* NOTE(review): the literals 4 and 16 presumably mean CTX1_SLOTS and SRC_CMD_BYTES */
+ if (overflow)
+ append_cmd(descptr, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
+ LDST_CLASS_2_CCB | (src_cnt - 4) * 16);
+ else
+ append_cmd(descptr, CMD_SEQ_LOAD | LDST_SRCDST_BYTE_CONTEXT |
+ LDST_CLASS_2_CCB | 4 * 16);
+
+ append_cmd(descptr, CMD_LOAD | DISABLE_AUTO_INFO_FIFO);
+
+ /* Load first source */
+ append_move(descptr, load_source(MOVE_SRC_CLASS1CTX, 0, label_src_mv) |
+ MOVE_WAITCOMP);
+
+ /* Refresh shared descriptor */
+ append_cmd(descptr, CMD_SHARED_DESC_HDR | HDR_SHARE_NEVER | HDR_ONE |
+ ((label_src_jmp & HDR_START_IDX_MASK) <<
+ HDR_START_IDX_SHIFT));
+
+ /* Load source and run loaded commands */
+ append_cmd(descptr, CMD_SEQ_IN_PTR | SQIN_EXT);
+ /* reserve room for the source command group that the MOVEs copy in */
+ append_len(descptr, SRC_CMD_LEN);
+
+ /* Skip read data */
+ append_seq_fifo_load(descptr, 0, KEY_VLF | FIFOLD_CLASS_SKIP);
+
+ /* Load chunk to ififo */
+ label_load = desc_len(descptr);
+ append_seq_fifo_load(descptr, CHUNK_SIZE, FIFOLD_TYPE_PK |
+ LDST_CLASS_1_CCB);
+
+ /* Update added number of bytes in ififo */
+ append_math_add_imm_u32(descptr, VARSEQOUTLEN, VARSEQOUTLEN, IMM,
+ CHUNK_SIZE);
+
+ /* Load chunk from ififo to math registers via DECO alignment block*/
+ append_load_imm_u32(descptr, NFIFOENTRY_LC1 | NFIFOENTRY_DTYPE_MSG |
+ CHUNK_SIZE, LDST_SRCDST_WORD_INFO_FIFO);
+ label_return = desc_len(descptr);
+ append_move(descptr, MOVE_WAITCOMP | MOVE_SRC_INFIFO |
+ MOVE_DEST_MATH0 | CHUNK_SIZE_Q);
+
+ /* XOR math registers with ofifo */
+ append_math_xor(descptr, REG0, REG0, OUTFIFO, REG_SIZE);
+ append_math_xor(descptr, REG1, REG1, OUTFIFO, REG_SIZE);
+ append_math_xor(descptr, REG2, REG2, OUTFIFO, REG_SIZE);
+ append_math_xor(descptr, REG3, REG3, OUTFIFO, REG_SIZE);
+
+ /* Move result to ofifo */
+ append_move(descptr, MOVE_SRC_MATH0 | MOVE_WAITCOMP |
+ MOVE_DEST_OUTFIFO | CHUNK_SIZE_Q);
+
+ /* Update reduced number of bytes in ififo */
+ append_math_sub_imm_u32(descptr, VARSEQOUTLEN, VARSEQOUTLEN, IMM,
+ CHUNK_SIZE_Q);
+
+ /* If ififo has no more data, store chunk */
+ store_jump_cmd = append_jump(descptr, JUMP_TEST_ALL |
+ JUMP_COND_MATH_Z);
+
+ /* If half of chunk left, use next source */
+ append_math_sub_imm_u32(descptr, NONE, VARSEQOUTLEN, IMM,
+ CHUNK_SIZE_H);
+ append_jump_to(descptr, JUMP_TEST_ALL | JUMP_COND_MATH_Z,
+ label_src_jmp);
+
+ /* Else, keep XORing */
+ append_jump_to(descptr, 0, label_return);
+
+ /* Store */
+ /* label_store is recorded here but not read again in this function */
+ label_store = desc_len(descptr);
+ set_jump_tgt_here(descptr, store_jump_cmd);
+
+ /* Store chunk to seqout */
+ append_seq_fifo_store(descptr, CHUNK_SIZE, FIFOST_TYPE_MESSAGE_DATA);
+
+ /* Halt if no more data */
+ append_math_sub(descptr, NONE, SEQOUTLEN, ONE, CAAM_CMD_SZ);
+ append_jump(descptr, JUMP_TYPE_HALT_USER | JUMP_TEST_ALL |
+ JUMP_COND_MATH_N);
+
+ /* Load first source's next chunk to ofifo */
+ append_move(descptr, MOVE_SRC_INFIFO | MOVE_DEST_OUTFIFO |
+ MOVE_WAITCOMP | CHUNK_SIZE);
+
+ /* Goto source */
+ append_cmd(descptr, CMD_SHARED_DESC_HDR | HDR_SHARE_NEVER | HDR_ONE |
+ ((label_src_jmp & HDR_START_IDX_MASK) <<
+ HDR_START_IDX_SHIFT));
+
+ /* First source, skip read data */
+ append_seq_fifo_load(descptr, 0, KEY_VLF | FIFOLD_CLASS_SKIP);
+
+ /* If no more data to read, go XOR read data */
+ append_math_sub(descptr, NONE, SEQINLEN, ONE, CAAM_CMD_SZ);
+ append_jump_to(descptr, JUMP_TEST_ALL | JUMP_COND_MATH_N,
+ label_return);
+
+ /* Otherwise, load chunk from first source to DECO alignment block */
+ append_seq_fifo_load(descptr, CHUNK_SIZE, FIFOLD_TYPE_PK |
+ LDST_CLASS_1_CCB);
+ append_load_imm_u32(descptr, NFIFOENTRY_LC1 | NFIFOENTRY_DTYPE_MSG |
+ CHUNK_SIZE, LDST_SRCDST_WORD_INFO_FIFO);
+
+ /* Load second source */
+ append_move(descptr, load_source(MOVE_SRC_CLASS1CTX, 1, label_src_mv));
+
+ /* XOR previous pass if this is not first pass */
+ append_math_sub(descptr, NONE, VARSEQINLEN, ONE, CAAM_CMD_SZ);
+ append_jump_to(descptr, JUMP_TEST_INVALL | JUMP_COND_MATH_N,
+ label_return);
+
+ /* Else, move chunk for DECO alignment block to ofifo */
+ append_move(descptr, MOVE_SRC_INFIFO | MOVE_DEST_OUTFIFO |
+ MOVE_WAITCOMP | CHUNK_SIZE);
+
+ /* and track number of bytes to write*/
+ append_math_add_imm_u32(descptr, SEQOUTLEN, SEQINLEN, IMM, CHUNK_SIZE);
+
+ /* Goto source */
+ append_cmd(descptr, CMD_SHARED_DESC_HDR | HDR_SHARE_NEVER | HDR_ONE |
+ ((label_src_jmp & HDR_START_IDX_MASK) <<
+ HDR_START_IDX_SHIFT));
+
+ /* Last source, skip read data */
+ append_seq_fifo_load(descptr, 0, KEY_VLF | FIFOLD_CLASS_SKIP);
+
+ /* Update number of bytes to skip */
+ append_math_add_imm_u32(descptr, VARSEQINLEN, VARSEQINLEN, IMM,
+ CHUNK_SIZE);
+
+ /* Load first source */
+ append_move(descptr, load_source(MOVE_SRC_CLASS1CTX, 0, label_src_mv));
+
+ /* Goto data loading */
+ append_cmd(descptr, CMD_SHARED_DESC_HDR | HDR_SHARE_NEVER | HDR_ONE |
+ ((label_load & HDR_START_IDX_MASK) << HDR_START_IDX_SHIFT));
+
+#ifdef DEBUG
+ print_hex_dump(KERN_ERR, "shdesc @"xstr(__LINE__)": ",
+ DUMP_PREFIX_ADDRESS, 16, 4, descptr, CAAM_CMD_SZ *
+ desc_len(descptr), 1);
+#endif
+}
+
+/*
+ * dmaengine device_tx_status hook: report the channel's last issued and last
+ * completed cookies through @txstate and tell whether @cookie has completed.
+ */
+static enum dma_status caam_jr_tx_status(struct dma_chan *chan,
+					 dma_cookie_t cookie,
+					 struct dma_tx_state *txstate)
+{
+	struct caam_dma_jr *dma_jr = container_of(chan, struct caam_dma_jr,
+						  chan);
+	dma_cookie_t newest = chan->cookie;
+	dma_cookie_t finished = dma_jr->completed_cookie;
+
+	dma_set_tx_state(txstate, finished, newest, 0);
+
+	return dma_async_is_complete(cookie, finished, newest);
+}
+
+/*
+ * Schedule the clean-up tasklet when completed-but-unacked requests are
+ * waiting on done_not_acked.
+ */
+static inline void try_clear_desc(struct caam_dma_jr *dma_jr)
+{
+	bool have_unacked;
+
+	spin_lock_bh(&dma_jr->done_lock);
+	have_unacked = !list_empty(&dma_jr->done_not_acked);
+	spin_unlock_bh(&dma_jr->done_lock);
+
+	if (have_unacked)
+		tasklet_schedule(&dma_jr->handle_done);
+}
+
+/*
+ * Tasklet function for requests that have completed but may not have been
+ * acked by the client: acked requests are recycled into the free pool (or
+ * freed when the pool is full), unacked ones stay on done_not_acked.
+ *
+ * @data: the owning struct caam_dma_jr, cast to unsigned long
+ *
+ * done_lock is held for the whole walk of done_not_acked. Dropping it inside
+ * the loop would let caam_dma_xor_done() modify the list while the iterator
+ * still holds a cached "next" pointer. Acked entries are therefore first
+ * moved to a private list and recycled after the lock has been released.
+ */
+static void check_done(unsigned long data)
+{
+	struct caam_dma_jr *dma_jr = (struct caam_dma_jr *) data;
+	struct caam_dma_async_tx_desc *desc, *_desc;
+	LIST_HEAD(acked);
+
+	spin_lock_bh(&dma_jr->done_lock);
+	list_for_each_entry_safe(desc, _desc, &dma_jr->done_not_acked, node) {
+		if (async_tx_test_ack(&desc->async_tx))
+			list_move_tail(&desc->node, &acked);
+	}
+	spin_unlock_bh(&dma_jr->done_lock);
+
+	/* "acked" is private to this invocation, so no list lock is needed */
+	list_for_each_entry_safe(desc, _desc, &acked, node) {
+		bool recycled = false;
+
+		list_del(&desc->node);
+
+		spin_lock_bh(&dma_jr->pool_lock);
+		if (dma_jr->soft_desc->desc_cnt < MAX_INITIAL_DESCS) {
+			list_add(&desc->node, &dma_jr->soft_desc->head);
+			dma_jr->soft_desc->desc_cnt++;
+			recycled = true;
+		}
+		spin_unlock_bh(&dma_jr->pool_lock);
+
+		/* pool already full: give the memory back */
+		if (!recycled)
+			kfree(desc);
+	}
+}
+
+/*
+ * Completion callback passed to caam_jr_enqueue() for every XOR job.
+ *
+ * @dev: device reported by the job ring layer; used only for the error print
+ * @hwdesc: job descriptor that was executed (not used here)
+ * @status: job status word; non-zero is reported as an error
+ * @auxarg: the struct caam_dma_async_tx_desc of the request
+ *
+ * Runs dependent operations, advances the completed cookie, unmaps the
+ * buffers, recycles or parks the request descriptor and finally calls the
+ * client callback.
+ */
+static void caam_dma_xor_done(struct device *dev, u32 *hwdesc, u32 status,
+ void *auxarg)
+{
+ struct caam_dma_async_tx_desc *desc;
+ struct caam_dma_jr *dma_jr;
+ dma_async_tx_callback callback;
+ void *callback_param;
+ struct device *jrdev;
+ enum dma_ctrl_flags flags;
+
+ desc = (struct caam_dma_async_tx_desc *)auxarg;
+ dma_jr = desc->dma_jr;
+ /* mappings were created against the parent device in the prep routine */
+ jrdev = dma_jr->caam_hw_jr->parentdev;
+ flags = desc->async_tx.flags;
+
+ if (status) {
+ char tmp[256];
+ dev_err(dev, "%s\n", caam_jr_strstatus(tmp, status));
+ }
+
+ dma_run_dependencies(&desc->async_tx);
+
+ spin_lock_bh(&dma_jr->desc_lock);
+ if (dma_jr->completed_cookie < desc->async_tx.cookie) {
+ dma_jr->completed_cookie = desc->async_tx.cookie;
+ if (dma_jr->completed_cookie == DMA_MAX_COOKIE)
+ dma_jr->completed_cookie = DMA_MIN_COOKIE;
+ }
+ spin_unlock_bh(&dma_jr->desc_lock);
+
+ /* copy the callback now: desc may be recycled or freed further down */
+ callback = desc->async_tx.callback;
+ callback_param = desc->async_tx.callback_param;
+
+ dma_unmap_single(jrdev, desc->cmd_desc_phys,
+ CMD_DESC_LEN * sizeof(u32), DMA_TO_DEVICE);
+
+ if (likely(!(flags & DMA_COMPL_SKIP_DEST_UNMAP)))
+ dma_unmap_page(jrdev, desc->dest, desc->dma_len,
+ DMA_BIDIRECTIONAL);
+
+ if (likely(!(flags & DMA_COMPL_SKIP_SRC_UNMAP))) {
+ u32 i;
+ for (i = 0; i < desc->src_cnt; i++) {
+ /* a source that aliases the destination is not unmapped again here */
+ if (desc->src[i] == desc->dest)
+ continue;
+ dma_unmap_page(jrdev, desc->src[i],
+ desc->dma_len, DMA_TO_DEVICE);
+ }
+ }
+
+ /* acked requests go straight back to the pool, others wait for the ack */
+ if (async_tx_test_ack(&desc->async_tx)) {
+ spin_lock_bh(&dma_jr->pool_lock);
+ if (dma_jr->soft_desc->desc_cnt < MAX_INITIAL_DESCS) {
+ list_add(&desc->node, &dma_jr->soft_desc->head);
+ dma_jr->soft_desc->desc_cnt++;
+ spin_unlock_bh(&dma_jr->pool_lock);
+ } else {
+ spin_unlock_bh(&dma_jr->pool_lock);
+ kfree(desc);
+ }
+ } else {
+ spin_lock_bh(&dma_jr->done_lock);
+ INIT_LIST_HEAD(&desc->node);
+ list_add_tail(&desc->node, &dma_jr->done_not_acked);
+ spin_unlock_bh(&dma_jr->done_lock);
+ }
+ try_clear_desc(dma_jr);
+
+ if (callback)
+ callback(callback_param);
+}
+
+/*
+ * dmaengine device_issue_pending hook: hand queued requests to the job ring
+ * in submission order. The walk stops at the first request the ring refuses;
+ * that request and everything behind it stay on submit_q.
+ */
+static void caam_jr_issue_pending(struct dma_chan *chan)
+{
+	struct caam_dma_jr *dma_jr = container_of(chan, struct caam_dma_jr,
+						  chan);
+	struct device *dev = dma_jr->dev;
+	struct caam_dma_async_tx_desc *cur, *next;
+
+	spin_lock_bh(&dma_jr->desc_lock);
+	list_for_each_entry_safe(cur, next, &dma_jr->submit_q, node) {
+		cur->dma_jr = dma_jr;
+		if (caam_jr_enqueue(dev, cur->job_desc,
+				    caam_dma_xor_done, cur) < 0)
+			break;
+
+		list_del(&cur->node);
+	}
+	spin_unlock_bh(&dma_jr->desc_lock);
+}
+
+/*
+ * tx_submit hook: assign the next cookie of the channel to @tx and append
+ * the request to submit_q. Returns the assigned cookie.
+ */
+static dma_cookie_t caam_jr_tx_submit(struct dma_async_tx_descriptor *tx)
+{
+	struct caam_dma_async_tx_desc *desc =
+		container_of(tx, struct caam_dma_async_tx_desc, async_tx);
+	struct caam_dma_jr *jr =
+		container_of(tx->chan, struct caam_dma_jr, chan);
+	dma_cookie_t cookie;
+
+	spin_lock_bh(&jr->desc_lock);
+
+	/* cookies below DMA_MIN_COOKIE are not handed out */
+	cookie = jr->chan.cookie + 1;
+	cookie = (cookie < DMA_MIN_COOKIE) ? DMA_MIN_COOKIE : cookie;
+
+	desc->async_tx.cookie = cookie;
+	jr->chan.cookie = cookie;
+	list_add_tail(&desc->node, &jr->submit_q);
+
+	spin_unlock_bh(&jr->desc_lock);
+
+	return cookie;
+}
+
+/*
+ * dmaengine device_prep_dma_xor hook: build a request that XORs @src_cnt
+ * sources of @len bytes into @dest.
+ *
+ * Returns the prepared descriptor, or NULL when the request cannot be
+ * prepared (bad source count or no memory). NULL is what dmaengine clients
+ * test for; an ERR_PTR() would be dereferenced by them.
+ *
+ * Prep routines can be called from atomic context, so a descriptor that
+ * cannot be taken from the pool is allocated with GFP_NOWAIT.
+ */
+static struct dma_async_tx_descriptor *
+caam_jr_prep_dma_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
+		     unsigned int src_cnt, size_t len, unsigned long flags)
+{
+	struct caam_dma_jr *jr = container_of(chan, struct caam_dma_jr, chan);
+	struct caam_dma_async_tx_desc *desc = NULL;
+	struct caam_drv_private *priv;
+
+	/*
+	 * prepare_caam_xor_desc() computes "src_cnt - 1" on an unsigned
+	 * value; zero sources would wrap and overrun the command buffer.
+	 */
+	if (!src_cnt) {
+		dev_err(jr->dev, "no XOR sources supplied\n");
+		return NULL;
+	}
+
+	if (src_cnt > MAX_XOR_SRCS) {
+		dev_err(jr->dev, "%u srcs exceed max supported %d srcs\n",
+			src_cnt, MAX_XOR_SRCS);
+		return NULL;
+	}
+
+	/* prefer a recycled descriptor from the pool */
+	spin_lock_bh(&jr->pool_lock);
+	if (jr->soft_desc->desc_cnt) {
+		desc = container_of(jr->soft_desc->head.next,
+				    struct caam_dma_async_tx_desc, node);
+		jr->soft_desc->desc_cnt--;
+		list_del(&desc->node);
+	}
+	spin_unlock_bh(&jr->pool_lock);
+
+	if (!desc) {
+		desc = kzalloc(sizeof(*desc), GFP_NOWAIT);
+		if (!desc) {
+			dev_err(jr->dev, "Out of memory for XOR async tx\n");
+			/* try to get completed descriptors back into the pool */
+			try_clear_desc(jr);
+
+			return NULL;
+		}
+
+		desc->async_tx.tx_submit = caam_jr_tx_submit;
+	}
+
+	dma_async_tx_descriptor_init(&desc->async_tx, &jr->chan);
+
+	priv = dev_get_drvdata(jr->caam_hw_jr->parentdev);
+
+	prepare_caam_xor_desc(jr->caam_hw_jr->parentdev, desc,
+			      priv->xor_sh_desc[0].sh_desc_phys, dest,
+			      src, src_cnt, len);
+
+	desc->async_tx.flags = flags;
+	desc->async_tx.cookie = -EBUSY;
+	return &desc->async_tx;
+}
+
+/*
+ * dmaengine device_free_chan_resources hook: free every descriptor counted
+ * in the channel's pool and then the pool itself.
+ */
+static void caam_jr_free_chan_resources(struct dma_chan *chan)
+{
+	struct caam_dma_jr *jr = container_of(chan, struct caam_dma_jr, chan);
+	struct caam_dma_desc_pool *pool = jr->soft_desc;
+	struct list_head *pos = pool->head.next;
+	struct list_head *following;
+
+	for (; pool->desc_cnt > 0; pool->desc_cnt--) {
+		/* fetch the successor before the entry is unlinked and freed */
+		following = pos->next;
+		list_del(pos);
+		kfree(container_of(pos, struct caam_dma_async_tx_desc, node));
+		pos = following;
+	}
+
+	kfree(pool);
+}
+
+/*
+ * dmaengine device_alloc_chan_resources hook: create the channel's pool of
+ * MAX_INITIAL_DESCS pre-allocated request descriptors.
+ *
+ * Returns the number of descriptors allocated, or -ENOMEM. On failure
+ * everything allocated so far is released and jr->soft_desc is left
+ * untouched, so nothing leaks and no half-built pool is published.
+ */
+static int caam_jr_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct caam_dma_jr *jr = container_of(chan, struct caam_dma_jr, chan);
+	struct caam_dma_desc_pool *pool;
+	struct caam_dma_async_tx_desc *desc, *_desc;
+	unsigned int i;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool) {
+		pr_err("%s: Failed to allocate resources for DMA channel\n",
+		       __func__);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&pool->head);
+	for (i = 0; i < MAX_INITIAL_DESCS; i++) {
+		desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+		if (!desc)
+			goto err_free_pool;
+
+		desc->async_tx.tx_submit = caam_jr_tx_submit;
+		list_add_tail(&desc->node, &pool->head);
+		pool->desc_cnt++;
+	}
+
+	/* publish the pool only once it is complete */
+	jr->soft_desc = pool;
+
+	return pool->desc_cnt;
+
+err_free_pool:
+	list_for_each_entry_safe(desc, _desc, &pool->head, node) {
+		list_del(&desc->node);
+		kfree(desc);
+	}
+	kfree(pool);
+
+	return -ENOMEM;
+}
+
+/*
+ * Create the DMA channel that fronts job ring @dev and add it to the
+ * controller's dma_device. Returns 0 on success or -ENOMEM.
+ */
+static int caam_jr_chan_bind(struct device *ctrldev, struct device *dev)
+{
+	struct caam_drv_private *ctrlpriv = dev_get_drvdata(ctrldev);
+	struct caam_drv_private_jr *jrpriv = dev_get_drvdata(dev);
+	struct dma_device *dma_dev = &ctrlpriv->dma_dev;
+	struct caam_dma_jr *dma_jr;
+
+	dma_jr = kzalloc(sizeof(struct caam_dma_jr), GFP_KERNEL);
+	if (!dma_jr) {
+		dev_err(dev, "Failed to allocate memory for caam job queue\n");
+		return -ENOMEM;
+	}
+
+	/* job ring back-references */
+	dma_jr->caam_hw_jr = jrpriv;
+	dma_jr->dev = dev;
+	jrpriv->jrdev = dev;
+
+	/* locks, queues and the clean-up tasklet */
+	spin_lock_init(&dma_jr->desc_lock);
+	spin_lock_init(&dma_jr->pool_lock);
+	spin_lock_init(&dma_jr->done_lock);
+	INIT_LIST_HEAD(&dma_jr->submit_q);
+	INIT_LIST_HEAD(&dma_jr->done_not_acked);
+	tasklet_init(&dma_jr->handle_done, check_done, (unsigned long) dma_jr);
+
+	/* finally hook the channel into the dma_device */
+	dma_jr->chan.device = dma_dev;
+	dma_jr->chan.private = dma_jr;
+	list_add_tail(&dma_jr->chan.device_node, &dma_dev->channels);
+	dma_dev->chancnt++;
+
+	return 0;
+}
+
+/* Remove @chan from the controller's channel list and adjust the count */
+static inline void caam_jr_chan_unbind(struct device *ctrldev,
+				       struct dma_chan *chan)
+{
+	struct caam_drv_private *ctrlpriv = dev_get_drvdata(ctrldev);
+
+	list_del(&chan->device_node);
+	ctrlpriv->dma_dev.chancnt--;
+}
+
+/*
+ * Unlink @chan from whatever list it is on and free the caam_dma_jr that
+ * embeds it.
+ *
+ * The handle_done tasklet dereferences the caam_dma_jr, so it is killed
+ * first; otherwise a tasklet that is still scheduled could run on freed
+ * memory.
+ */
+static inline void caam_jr_free(struct dma_chan *chan)
+{
+	struct caam_dma_jr *dma_jr = container_of(chan, struct caam_dma_jr,
+						  chan);
+
+	tasklet_kill(&dma_jr->handle_done);
+	list_del(&chan->device_node);
+	kfree(dma_jr);
+}
+
+/*
+ * Build and map the shared XOR descriptor, create one DMA channel per job
+ * ring and register the dma_device with the dmaengine core.
+ *
+ * Returns 0 on success or a negative errno. On failure every step already
+ * taken is undone: channels are freed, the shared descriptor is unmapped and
+ * freed, and priv->xor_sh_desc is reset to NULL.
+ */
+static int caam_jr_dma_init(struct device *ctrldev)
+{
+	struct caam_drv_private *priv = dev_get_drvdata(ctrldev);
+	struct dma_device *dma_dev = &priv->dma_dev;
+	struct caam_xor_sh_desc *sh_desc;
+	struct dma_chan *chan, *_chan;
+	int i, err;
+
+	sh_desc = kzalloc(sizeof(*sh_desc), GFP_KERNEL);
+	if (!sh_desc) {
+		dev_err(ctrldev,
+			"Failed to allocate memory for XOR Shared "
+			"descriptor\n");
+		return -ENOMEM;
+	}
+	priv->xor_sh_desc = sh_desc;
+
+	prepare_caam_xor_sh_desc(sh_desc->desc, MAX_XOR_SRCS);
+	/*
+	 * NOTE(review): the buffer holds SH_DESC_LEN + CMD_MOVE_OVERFLOW_LEN
+	 * words but only SH_DESC_LEN words are mapped; the size is kept so
+	 * that it matches the unmap in caam_jr_dma_exit(). Confirm whether
+	 * both should cover the whole buffer.
+	 */
+	sh_desc->sh_desc_phys = dma_map_single(ctrldev, &sh_desc->desc,
+					       SH_DESC_LEN * sizeof(u32),
+					       DMA_TO_DEVICE);
+	if (dma_mapping_error(ctrldev, sh_desc->sh_desc_phys)) {
+		dev_err(ctrldev, "unable to map XOR shared descriptor\n");
+		err = -ENOMEM;
+		goto err_free_sh_desc;
+	}
+
+	dma_dev->dev = ctrldev;
+	INIT_LIST_HEAD(&dma_dev->channels);
+
+	dma_dev->max_xor = MAX_XOR_SRCS;
+
+	/*
+	 * Unaligned transactions are not off-loaded to caam.
+	 * NOTE(review): xor_align is an alignment shift, so 8 requests
+	 * 256-byte alignment, while the data path works on 128-byte chunks
+	 * (CHUNK_SIZE); confirm which value is intended.
+	 */
+	dma_dev->xor_align = 8;
+	dma_cap_set(DMA_XOR, dma_dev->cap_mask);
+
+	dma_dev->device_alloc_chan_resources = caam_jr_alloc_chan_resources;
+	dma_dev->device_tx_status = caam_jr_tx_status;
+	dma_dev->device_issue_pending = caam_jr_issue_pending;
+	dma_dev->device_prep_dma_xor = caam_jr_prep_dma_xor;
+	dma_dev->device_free_chan_resources = caam_jr_free_chan_resources;
+
+	for (i = 0; i < priv->total_jobrs; i++) {
+		err = caam_jr_chan_bind(ctrldev, priv->jrdev[i]);
+		if (err)
+			goto err_free_chans;
+	}
+
+	err = dma_async_device_register(dma_dev);
+	if (err) {
+		dev_err(ctrldev, "unable to register XOR dma device: %d\n",
+			err);
+		goto err_free_chans;
+	}
+
+	dev_info(ctrldev, "caam xor support with %d job rings\n",
+		 priv->total_jobrs);
+
+	return 0;
+
+err_free_chans:
+	/* caam_jr_free() unlinks each channel from dma_dev->channels */
+	list_for_each_entry_safe(chan, _chan, &dma_dev->channels, device_node)
+		caam_jr_free(chan);
+	dma_dev->chancnt = 0;
+	dma_unmap_single(ctrldev, sh_desc->sh_desc_phys,
+			 SH_DESC_LEN * sizeof(u32), DMA_TO_DEVICE);
+err_free_sh_desc:
+	kfree(sh_desc);
+	priv->xor_sh_desc = NULL;
+
+	return err;
+}
+
+/*
+ * Undo caam_jr_dma_init(): detach and free all channels, unregister the
+ * dma_device and release the shared XOR descriptor.
+ *
+ * caam_jr_dma_init() allocates exactly one struct caam_xor_sh_desc, so only
+ * that single mapping is released here; indexing further into xor_sh_desc
+ * would read past the allocation and unmap bogus addresses.
+ */
+static void caam_jr_dma_exit(struct device *ctrldev)
+{
+	struct caam_drv_private *priv = dev_get_drvdata(ctrldev);
+	struct dma_device *dma_dev = &priv->dma_dev;
+	struct dma_chan *chan, *_chan;
+	LIST_HEAD(to_free);
+
+	/* before unregistering device, remove channels... */
+	list_for_each_entry_safe(chan, _chan, &dma_dev->channels, device_node) {
+		caam_jr_chan_unbind(ctrldev, chan);
+		list_add_tail(&chan->device_node, &to_free);
+	}
+
+	dma_async_device_unregister(dma_dev);
+
+	/*
+	 * ...but don't delete them until device has been unregistered, so
+	 * that deleted channels will not be used
+	 */
+	list_for_each_entry_safe(chan, _chan, &to_free, device_node)
+		caam_jr_free(chan);
+
+	dma_unmap_single(ctrldev, priv->xor_sh_desc->sh_desc_phys,
+			 SH_DESC_LEN * sizeof(u32), DMA_TO_DEVICE);
+
+	kfree(priv->xor_sh_desc);
+	priv->xor_sh_desc = NULL;
+	dev_info(ctrldev, "caam xor support disabled\n");
+}
+
+/*
+ * Module init: locate the CAAM controller through the device tree and set up
+ * XOR offload on it.
+ *
+ * Returns 0 on success, -ENODEV when the controller node or platform device
+ * is missing or has no driver data yet, or the error reported by
+ * caam_jr_dma_init().
+ *
+ * The device reference obtained from of_find_device_by_node() is kept while
+ * the module is loaded and dropped only on the failure paths.
+ */
+static int __init caam_xor_init(void)
+{
+	struct device_node *dev_node;
+	struct platform_device *pdev;
+	struct device *ctrldev;
+	struct caam_drv_private *priv;
+	int err;
+
+	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
+	if (!dev_node)
+		return -ENODEV;
+
+	pdev = of_find_device_by_node(dev_node);
+	/* the node is not needed any more, whether or not a device was found */
+	of_node_put(dev_node);
+	if (!pdev)
+		return -ENODEV;
+
+	ctrldev = &pdev->dev;
+	priv = dev_get_drvdata(ctrldev);
+	if (!priv) {
+		/* controller driver has not been bound (or has no data) */
+		put_device(ctrldev);
+		return -ENODEV;
+	}
+
+	atomic_set(&priv->tfm_count, -1);
+
+	/* register caam device */
+	err = caam_jr_dma_init(ctrldev);
+	if (err) {
+		dev_err(ctrldev, "error in xor initialization: %d\n", err);
+		put_device(ctrldev);
+	}
+
+	return err;
+}
+
+/*
+ * Module exit: locate the CAAM controller again and tear down XOR offload.
+ *
+ * The OF node reference is dropped on every path. Two device references are
+ * released at the end: the one taken by the lookup below and the one that
+ * caam_xor_init() keeps after a successful initialisation (this function only
+ * runs after init succeeded).
+ */
+static void __exit caam_xor_exit(void)
+{
+	struct device_node *dev_node;
+	struct platform_device *pdev;
+	struct device *ctrldev;
+
+	dev_node = of_find_compatible_node(NULL, NULL, "fsl,sec-v4.0");
+	if (!dev_node)
+		return;
+
+	pdev = of_find_device_by_node(dev_node);
+	of_node_put(dev_node);
+	if (!pdev)
+		return;
+
+	ctrldev = &pdev->dev;
+
+	caam_jr_dma_exit(ctrldev);
+
+	/* reference taken by of_find_device_by_node() above */
+	put_device(ctrldev);
+	/* reference held since caam_xor_init() */
+	put_device(ctrldev);
+}
+
+module_init(caam_xor_init);
+module_exit(caam_xor_exit);
+
+MODULE_LICENSE("GPL");
+/* describe this module (XOR offload), not the crypto API front end */
+MODULE_DESCRIPTION("FSL CAAM support for XOR offload via async_tx API");
+MODULE_AUTHOR("Freescale Semiconductor - NMG/STC");
diff --git a/drivers/crypto/caam/desc_constr.h b/drivers/crypto/caam/desc_constr.h
index c85c1f0..d06bf68 100644
--- a/drivers/crypto/caam/desc_constr.h
+++ b/drivers/crypto/caam/desc_constr.h
@@ -9,11 +9,13 @@
#define IMMEDIATE (1 << 23)
#define CAAM_CMD_SZ sizeof(u32)
#define CAAM_PTR_SZ sizeof(dma_addr_t)
+#define CAAM_PTR_LEN (CAAM_PTR_SZ / CAAM_CMD_SZ)
#define CAAM_DESC_BYTES_MAX (CAAM_CMD_SZ * MAX_CAAM_DESCSIZE)

#ifdef DEBUG
-#define PRINT_POS do { printk(KERN_DEBUG "%02d: %s\n", desc_len(desc),\
- &__func__[sizeof("append")]); } while (0)
+#define PRINT_POS do { pr_debug("%02d: %s\n", desc_len(desc),\
+ &__func__[sizeof("append")]);\
+ } while (0)
#else
#define PRINT_POS
#endif
@@ -82,6 +84,20 @@ static inline void append_ptr(u32 *desc, dma_addr_t ptr)
(*desc) += CAAM_PTR_SZ / CAAM_CMD_SZ;
}

+/*
+ * Store a bus pointer at @desc without touching any descriptor header, and
+ * return the address of the word that follows the pointer.
+ */
+static inline u32 *write_ptr(u32 *desc, dma_addr_t ptr)
+{
+	u32 *next = desc + CAAM_PTR_LEN;
+
+	memcpy(desc, &ptr, CAAM_PTR_SZ);
+
+	return next;
+}
+
+/* Grow the length recorded in the descriptor header by @len words */
+static inline void append_len(u32 *desc, unsigned int len)
+{
+	*desc += len;
+}
+
static inline void init_job_desc_shared(u32 *desc, dma_addr_t ptr, int len,
u32 options)
{
@@ -110,6 +126,14 @@ static inline void append_cmd(u32 *desc, u32 command)
(*desc)++;
}

+/*
+ * Store one command word at @desc, leaving any descriptor header alone, and
+ * return the address of the next word.
+ */
+static inline u32 *write_cmd(u32 *desc, u32 command)
+{
+	desc[0] = command;
+
+	return &desc[1];
+}
+
static inline void append_cmd_ptr(u32 *desc, dma_addr_t ptr, int len,
u32 command)
{
@@ -143,11 +167,28 @@ static inline u32 *append_jump(u32 *desc, u32 options)
return cmd;
}

+/*
+ * Append a jump whose destination @target is given as a word offset from the
+ * descriptor header; the relative offset is computed from the current length.
+ */
+static inline void append_jump_to(u32 *desc, u32 options, u32 target)
+{
+	u32 rel_offset;
+
+	PRINT_POS;
+
+	rel_offset = (target - desc_len(desc)) & JUMP_OFFSET_MASK;
+	append_jump(desc, options | rel_offset);
+}
+
static inline void set_jump_tgt_here(u32 *desc, u32 *jump_cmd)
{
*jump_cmd = *jump_cmd | (desc_len(desc) - (jump_cmd - desc));
}

+/*
+ * Mark @len words starting at @desc as carrying no commands by writing a
+ * jump of @len words into the first one; returns the address past them.
+ */
+static inline u32 *write_nop(u32 *desc, int len)
+{
+	u32 *past = desc + len;
+
+	desc[0] = CMD_JUMP | len;
+
+	return past;
+}
+
#define APPEND_CMD(cmd, op) \
static inline void append_##cmd(u32 *desc, u32 options) \
{ \
@@ -157,6 +198,14 @@ static inline void append_##cmd(u32 *desc, u32 options) \
APPEND_CMD(operation, OPERATION)
APPEND_CMD(move, MOVE)

+/*
+ * Generate write_<cmd>(): like append_<cmd>() but built on write_cmd(), i.e.
+ * the command word CMD_<op> | options is stored at desc and the address of
+ * the following word is returned.
+ */
+#define WRITE_CMD(cmd, op) \
+static inline u32 *write_##cmd(u32 *desc, u32 options) \
+{ \
+ PRINT_POS; \
+ return write_cmd(desc, CMD_##op | options); \
+}
+WRITE_CMD(move, MOVE)
+
#define APPEND_CMD_LEN(cmd, op) \
static inline void append_##cmd(u32 *desc, unsigned int len, u32 options) \
{ \
diff --git a/drivers/crypto/caam/intern.h b/drivers/crypto/caam/intern.h
index 5cd4c1b..2b41e31 100644
--- a/drivers/crypto/caam/intern.h
+++ b/drivers/crypto/caam/intern.h
@@ -26,6 +26,8 @@
#define JOBR_INTC_COUNT_THLD 0
#endif

+#define CAAM_NAPI_WEIGHT 63
+
/*
* Storage for tracking each in-process entry moving across a ring
* Each entry on an output ring needs one of these
@@ -58,6 +60,7 @@ struct caam_drv_private_jr {
int out_ring_read_index; /* Output index "tail" */
int tail; /* entinfo (s/w ring) tail index */
struct jr_outentry *outring; /* Base of output ring, DMA-safe */
+ struct device *jrdev;
};

/*
@@ -91,6 +94,10 @@ struct caam_drv_private {
/* list of registered hash algorithms (mk generic context handle?) */
struct list_head hash_list;

+ /* For DMA-XOR support */
+ struct dma_device dma_dev;
+ struct caam_xor_sh_desc *xor_sh_desc;
+
/*
* debugfs entries for developer view into driver/device
* variables at runtime.
diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c
index 53c8c51..8dc81cf 100644
--- a/drivers/crypto/caam/jr.c
+++ b/drivers/crypto/caam/jr.c
@@ -80,8 +80,8 @@ static void caam_jr_dequeue(unsigned long devarg)
/* we should never fail to find a matching descriptor */
BUG_ON(CIRC_CNT(head, tail + i, JOBR_DEPTH) <= 0);

- /* Unmap just-run descriptor so we can post-process */
- dma_unmap_single(dev, jrp->outring[hw_idx].desc,
+ /* Unmap just-run job descriptor so we can post-process */
+ dma_unmap_single(jrp->jrdev, jrp->outring[hw_idx].desc,
jrp->entinfo[sw_idx].desc_size,
DMA_TO_DEVICE);

@@ -230,7 +230,7 @@ int caam_jr_enqueue(struct device *dev, u32 *desc,
dma_addr_t desc_dma;

desc_size = (*desc & HDR_JD_LENGTH_MASK) * sizeof(u32);
- desc_dma = dma_map_single(dev, desc, desc_size, DMA_TO_DEVICE);
+ desc_dma = dma_map_single(jrp->jrdev, desc, desc_size, DMA_TO_DEVICE);
if (dma_mapping_error(dev, desc_dma)) {
dev_err(dev, "caam_jr_enqueue(): can't map jobdesc\n");
return -EIO;
@@ -466,6 +466,8 @@ int caam_jr_probe(struct platform_device *pdev, struct device_node *np,
else
dma_set_mask(jrdev, DMA_BIT_MASK(32));

+ jrpriv->jrdev = jrdev;
+
/* Identify the interrupt */
jrpriv->irq = of_irq_to_resource(np, 0, NULL);

--
1.7.0.4