Hi,
Sorry to spam the list so quickly again, but v3 had a critical
bug that needs fixing. Changes since v3:
- Addressed Andrew's review comments (thanks, Andrew!)
- sg_next() grew a silly bug right before posting, where it would
pass back the chain entry instead of the next one. Thanks to
Benny Halevy for spotting that.
I didn't include the big drivers portion of the patch since it
hasn't changed, so grab that from the v3 posting.
It's a subsystem function, prefix it as such.
Signed-off-by: Jens Axboe <[email protected]>
---
crypto/digest.c | 2 +-
crypto/scatterwalk.c | 2 +-
crypto/scatterwalk.h | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/crypto/digest.c b/crypto/digest.c
index 1bf7414..e56de67 100644
--- a/crypto/digest.c
+++ b/crypto/digest.c
@@ -77,7 +77,7 @@ static int update2(struct hash_desc *desc,
if (!nbytes)
break;
- sg = sg_next(sg);
+ sg = scatterwalk_sg_next(sg);
}
return 0;
diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c
index 81afd17..2e51f82 100644
--- a/crypto/scatterwalk.c
+++ b/crypto/scatterwalk.c
@@ -70,7 +70,7 @@ static void scatterwalk_pagedone(struct scatter_walk *walk, int out,
walk->offset += PAGE_SIZE - 1;
walk->offset &= PAGE_MASK;
if (walk->offset >= walk->sg->offset + walk->sg->length)
- scatterwalk_start(walk, sg_next(walk->sg));
+ scatterwalk_start(walk, scatterwalk_sg_next(walk->sg));
}
}
diff --git a/crypto/scatterwalk.h b/crypto/scatterwalk.h
index f1592cc..e049c62 100644
--- a/crypto/scatterwalk.h
+++ b/crypto/scatterwalk.h
@@ -20,7 +20,7 @@
#include "internal.h"
-static inline struct scatterlist *sg_next(struct scatterlist *sg)
+static inline struct scatterlist *scatterwalk_sg_next(struct scatterlist *sg)
{
return (++sg)->length ? sg : (void *)sg->page;
}
--
1.5.2.rc1
First step toward being able to change the scatterlist setup without
having to modify drivers (much :-)
Signed-off-by: Jens Axboe <[email protected]>
---
include/linux/scatterlist.h | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 4efbd9c..bed5ab4 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -20,4 +20,13 @@ static inline void sg_init_one(struct scatterlist *sg, const void *buf,
sg_set_buf(sg, buf, buflen);
}
+#define sg_next(sg) ((sg) + 1)
+#define sg_last(sg, nents) (&(sg[(nents) - 1]))
+
+/*
+ * Loop over each sg element, following the pointer to a new list if necessary
+ */
+#define for_each_sg(sglist, sg, nr, __i) \
+ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
+
#endif /* _LINUX_SCATTERLIST_H */
--
1.5.2.rc1
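For illustration, a driver-side loop converted to these helpers might
look like the sketch below (hypothetical code, not part of the series;
it mirrors the conversion pattern the libata patch just below applies):

#include <linux/scatterlist.h>

/* Sum segment lengths with for_each_sg() instead of indexing sgl[i] */
static unsigned int total_len(struct scatterlist *sgl, int nents)
{
	struct scatterlist *sg;
	unsigned int len = 0;
	int i;

	for_each_sg(sgl, sg, nents, i)
		len += sg->length;

	return len;
}

Once chaining is introduced later in the series, a loop written this
way keeps working unchanged, since sg_next() hides the hop to the
next table.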
This converts libata to using the sg helpers for looking up sg
elements, instead of doing it manually.
Signed-off-by: Jens Axboe <[email protected]>
---
drivers/ata/libata-core.c | 30 ++++++++++++++++--------------
include/linux/libata.h | 16 ++++++++++------
2 files changed, 26 insertions(+), 20 deletions(-)
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 4595d1f..fabb1f4 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -1370,7 +1370,7 @@ static void ata_qc_complete_internal(struct ata_queued_cmd *qc)
*/
unsigned ata_exec_internal_sg(struct ata_device *dev,
struct ata_taskfile *tf, const u8 *cdb,
- int dma_dir, struct scatterlist *sg,
+ int dma_dir, struct scatterlist *sgl,
unsigned int n_elem)
{
struct ata_port *ap = dev->ap;
@@ -1428,11 +1428,12 @@ unsigned ata_exec_internal_sg(struct ata_device *dev,
qc->dma_dir = dma_dir;
if (dma_dir != DMA_NONE) {
unsigned int i, buflen = 0;
+ struct scatterlist *sg;
- for (i = 0; i < n_elem; i++)
- buflen += sg[i].length;
+ for_each_sg(sgl, sg, n_elem, i)
+ buflen += sg->length;
- ata_sg_init(qc, sg, n_elem);
+ ata_sg_init(qc, sgl, n_elem);
qc->nbytes = buflen;
}
@@ -3982,7 +3983,7 @@ void ata_sg_clean(struct ata_queued_cmd *qc)
if (qc->n_elem)
dma_unmap_sg(ap->dev, sg, qc->n_elem, dir);
/* restore last sg */
- sg[qc->orig_n_elem - 1].length += qc->pad_len;
+ sg_last(sg, qc->orig_n_elem)->length += qc->pad_len;
if (pad_buf) {
struct scatterlist *psg = &qc->pad_sgent;
void *addr = kmap_atomic(psg->page, KM_IRQ0);
@@ -4141,6 +4142,7 @@ void ata_sg_init_one(struct ata_queued_cmd *qc, void *buf, unsigned int buflen)
qc->orig_n_elem = 1;
qc->buf_virt = buf;
qc->nbytes = buflen;
+ qc->cursg = qc->__sg;
sg_init_one(&qc->sgent, buf, buflen);
}
@@ -4166,6 +4168,7 @@ void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg,
qc->__sg = sg;
qc->n_elem = n_elem;
qc->orig_n_elem = n_elem;
+ qc->cursg = qc->__sg;
}
/**
@@ -4255,7 +4258,7 @@ static int ata_sg_setup(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->__sg;
- struct scatterlist *lsg = &sg[qc->n_elem - 1];
+ struct scatterlist *lsg = sg_last(qc->__sg, qc->n_elem);
int n_elem, pre_n_elem, dir, trim_sg = 0;
VPRINTK("ENTER, ata%u\n", ap->print_id);
@@ -4419,7 +4422,6 @@ void ata_data_xfer_noirq(struct ata_device *adev, unsigned char *buf,
static void ata_pio_sector(struct ata_queued_cmd *qc)
{
int do_write = (qc->tf.flags & ATA_TFLAG_WRITE);
- struct scatterlist *sg = qc->__sg;
struct ata_port *ap = qc->ap;
struct page *page;
unsigned int offset;
@@ -4428,8 +4430,8 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
if (qc->curbytes == qc->nbytes - qc->sect_size)
ap->hsm_task_state = HSM_ST_LAST;
- page = sg[qc->cursg].page;
- offset = sg[qc->cursg].offset + qc->cursg_ofs;
+ page = qc->cursg->page;
+ offset = qc->cursg->offset + qc->cursg_ofs;
/* get the current page and offset */
page = nth_page(page, (offset >> PAGE_SHIFT));
@@ -4457,8 +4459,8 @@ static void ata_pio_sector(struct ata_queued_cmd *qc)
qc->curbytes += qc->sect_size;
qc->cursg_ofs += qc->sect_size;
- if (qc->cursg_ofs == (&sg[qc->cursg])->length) {
- qc->cursg++;
+ if (qc->cursg_ofs == qc->cursg->length) {
+ qc->cursg = sg_next(qc->cursg);
qc->cursg_ofs = 0;
}
}
@@ -4551,7 +4553,7 @@ static void __atapi_pio_bytes(struct ata_queued_cmd *qc, unsigned int bytes)
ap->hsm_task_state = HSM_ST_LAST;
next_sg:
- if (unlikely(qc->cursg >= qc->n_elem)) {
+ if (unlikely(qc->cursg == sg_last(qc->__sg, qc->n_elem))) {
/*
* The end of qc->sg is reached and the device expects
* more data to transfer. In order not to overrun qc->sg
@@ -4574,7 +4576,7 @@ next_sg:
return;
}
- sg = &qc->__sg[qc->cursg];
+ sg = qc->cursg;
page = sg->page;
offset = sg->offset + qc->cursg_ofs;
@@ -4613,7 +4615,7 @@ next_sg:
qc->cursg_ofs += count;
if (qc->cursg_ofs == sg->length) {
- qc->cursg++;
+ qc->cursg = sg_next(qc->cursg);
qc->cursg_ofs = 0;
}
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 7906d75..8fad10e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -30,7 +30,7 @@
#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
#include <linux/io.h>
#include <linux/ata.h>
#include <linux/workqueue.h>
@@ -388,6 +388,7 @@ struct ata_queued_cmd {
unsigned long flags; /* ATA_QCFLAG_xxx */
unsigned int tag;
unsigned int n_elem;
+ unsigned int n_iter;
unsigned int orig_n_elem;
int dma_dir;
@@ -398,7 +399,7 @@ struct ata_queued_cmd {
unsigned int nbytes;
unsigned int curbytes;
- unsigned int cursg;
+ struct scatterlist *cursg;
unsigned int cursg_ofs;
struct scatterlist sgent;
@@ -935,7 +936,7 @@ ata_sg_is_last(struct scatterlist *sg, struct ata_queued_cmd *qc)
return 1;
if (qc->pad_len)
return 0;
- if (((sg - qc->__sg) + 1) == qc->n_elem)
+ if (qc->n_iter == qc->n_elem)
return 1;
return 0;
}
@@ -943,6 +944,7 @@ ata_sg_is_last(struct scatterlist *sg, struct ata_queued_cmd *qc)
static inline struct scatterlist *
ata_qc_first_sg(struct ata_queued_cmd *qc)
{
+ qc->n_iter = 0;
if (qc->n_elem)
return qc->__sg;
if (qc->pad_len)
@@ -955,8 +957,8 @@ ata_qc_next_sg(struct scatterlist *sg, struct ata_queued_cmd *qc)
{
if (sg == &qc->pad_sgent)
return NULL;
- if (++sg - qc->__sg < qc->n_elem)
- return sg;
+ if (++qc->n_iter < qc->n_elem)
+ return sg_next(sg);
if (qc->pad_len)
return &qc->pad_sgent;
return NULL;
@@ -1157,9 +1159,11 @@ static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
qc->dma_dir = DMA_NONE;
qc->__sg = NULL;
qc->flags = 0;
- qc->cursg = qc->cursg_ofs = 0;
+ qc->cursg = NULL;
+ qc->cursg_ofs = 0;
qc->nbytes = qc->curbytes = 0;
qc->n_elem = 0;
+ qc->n_iter = 0;
qc->err_mask = 0;
qc->pad_len = 0;
qc->sect_size = ATA_SECT_SIZE;
--
1.5.2.rc1
Convert the main rq mapper (blk_rq_map_sg()) to the sg helper setup.
Signed-off-by: Jens Axboe <[email protected]>
---
block/ll_rw_blk.c | 19 ++++++++++++-------
1 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 17e1889..b01a5f2 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -30,6 +30,7 @@
#include <linux/cpu.h>
#include <linux/blktrace_api.h>
#include <linux/fault-inject.h>
+#include <linux/scatterlist.h>
/*
* for max sense size
@@ -1307,9 +1308,11 @@ static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
*/
-int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg)
+int blk_rq_map_sg(request_queue_t *q, struct request *rq,
+ struct scatterlist *sglist)
{
struct bio_vec *bvec, *bvprv;
+ struct scatterlist *next_sg, *sg;
struct bio *bio;
int nsegs, i, cluster;
@@ -1320,6 +1323,7 @@ int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg
* for each bio in rq
*/
bvprv = NULL;
+ sg = next_sg = &sglist[0];
rq_for_each_bio(bio, rq) {
/*
* for each segment in bio
@@ -1328,7 +1332,7 @@ int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg
int nbytes = bvec->bv_len;
if (bvprv && cluster) {
- if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
+ if (sg->length + nbytes > q->max_segment_size)
goto new_segment;
if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
@@ -1336,14 +1340,15 @@ int blk_rq_map_sg(request_queue_t *q, struct request *rq, struct scatterlist *sg
if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
goto new_segment;
- sg[nsegs - 1].length += nbytes;
+ sg->length += nbytes;
} else {
new_segment:
- memset(&sg[nsegs],0,sizeof(struct scatterlist));
- sg[nsegs].page = bvec->bv_page;
- sg[nsegs].length = nbytes;
- sg[nsegs].offset = bvec->bv_offset;
+ sg = next_sg;
+ next_sg = sg_next(sg);
+ sg->page = bvec->bv_page;
+ sg->length = nbytes;
+ sg->offset = bvec->bv_offset;
nsegs++;
}
bvprv = bvec;
--
1.5.2.rc1
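Restated outside diff context, the essential change is that the mapper
no longer writes to sg[nsegs] by index but advances a cursor pair with
sg_next(), which is what later permits a chained table. A hypothetical
sketch (struct buf and fill_sgtable are made-up names):

#include <linux/scatterlist.h>

struct buf {
	struct page *page;
	unsigned int offset;
	unsigned int len;
};

/* Fill one sg entry per buffer through a moving cursor pair */
static int fill_sgtable(struct scatterlist *sglist, struct buf *bufs, int n)
{
	struct scatterlist *sg, *next_sg = sglist;
	int i, nsegs = 0;

	for (i = 0; i < n; i++) {
		sg = next_sg;
		next_sg = sg_next(sg);	/* may hop across a chain link */
		sg->page = bufs[i].page;
		sg->offset = bufs[i].offset;
		sg->length = bufs[i].len;
		nsegs++;
	}

	return nsegs;
}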
Signed-off-by: Jens Axboe <[email protected]>
---
include/asm-x86_64/dma-mapping.h | 3 +--
include/asm-x86_64/scatterlist.h | 2 ++
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h
index 6897e2a..ecd0f61 100644
--- a/include/asm-x86_64/dma-mapping.h
+++ b/include/asm-x86_64/dma-mapping.h
@@ -6,8 +6,7 @@
* documentation.
*/
-
-#include <asm/scatterlist.h>
+#include <linux/scatterlist.h>
#include <asm/io.h>
#include <asm/swiotlb.h>
diff --git a/include/asm-x86_64/scatterlist.h b/include/asm-x86_64/scatterlist.h
index eaf7ada..ef3986b 100644
--- a/include/asm-x86_64/scatterlist.h
+++ b/include/asm-x86_64/scatterlist.h
@@ -11,6 +11,8 @@ struct scatterlist {
unsigned int dma_length;
};
+#define ARCH_HAS_SG_CHAIN
+
#define ISA_DMA_THRESHOLD (0x00ffffff)
/* These macros should be used after a pci_map_sg call has been done
--
1.5.2.rc1
The core of the patch - allow the last sg element in a scatterlist
table to point to the start of a new table. We overload the LSB of
the page pointer to indicate whether this is a valid sg entry, or
merely a link to the next list.
Signed-off-by: Jens Axboe <[email protected]>
---
include/asm-i386/scatterlist.h | 2 +
include/linux/scatterlist.h | 52 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 52 insertions(+), 2 deletions(-)
diff --git a/include/asm-i386/scatterlist.h b/include/asm-i386/scatterlist.h
index d7e45a8..bd5164a 100644
--- a/include/asm-i386/scatterlist.h
+++ b/include/asm-i386/scatterlist.h
@@ -10,6 +10,8 @@ struct scatterlist {
unsigned int length;
};
+#define ARCH_HAS_SG_CHAIN
+
/* These macros should be used after a pci_map_sg call has been done
* to get bus addresses of each of the SG entries and their lengths.
* You should only work with the number of sg entries pci_map_sg
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index bed5ab4..fa2dc1c 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -20,8 +20,23 @@ static inline void sg_init_one(struct scatterlist *sg, const void *buf,
sg_set_buf(sg, buf, buflen);
}
-#define sg_next(sg) ((sg) + 1)
-#define sg_last(sg, nents) (&(sg[(nents) - 1]))
+#define sg_is_chain(sg) ((unsigned long) (sg)->page & 0x01)
+#define sg_chain_ptr(sg) \
+ ((struct scatterlist *) ((unsigned long) (sg)->page & ~0x01))
+
+/*
+ * We overload the meaning of ->page for sg chaining. If the LSB is
+ * set, the page member contains a pointer to the next sgtable.
+ */
+static inline struct scatterlist *sg_next(struct scatterlist *sg)
+{
+ sg++;
+
+ if (unlikely(sg_is_chain(sg)))
+ sg = sg_chain_ptr(sg);
+
+ return sg;
+}
/*
* Loop over each sg element, following the pointer to a new list if necessary
@@ -29,4 +44,37 @@ static inline void sg_init_one(struct scatterlist *sg, const void *buf,
#define for_each_sg(sglist, sg, nr, __i) \
for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
+/*
+ * We could improve this by passing in the maximum size of an sglist, so
+ * we could jump directly to the last table. That would eliminate this
+ * (potentially) lengthy scan.
+ */
+static inline struct scatterlist *sg_last(struct scatterlist *sgl,
+ unsigned int nents)
+{
+#ifdef ARCH_HAS_SG_CHAIN
+ struct scatterlist *ret = &sgl[nents - 1];
+#else
+ struct scatterlist *sg, *ret = NULL;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i)
+ ret = sg;
+
+#endif
+ return ret;
+}
+
+/*
+ * Chain previous sglist to this one
+ */
+static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
+ struct scatterlist *sgl)
+{
+#ifndef ARCH_HAS_SG_CHAIN
+ BUG();
+#endif
+ prv[prv_nents - 1].page = (struct page *) ((unsigned long) sgl | 0x01);
+}
+
#endif /* _LINUX_SCATTERLIST_H */
--
1.5.2.rc1
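A small usage sketch to make the mechanics concrete (illustrative only,
assumes ARCH_HAS_SG_CHAIN; process() is a made-up stand-in for real
per-segment work). Chaining consumes the last slot of the first table,
and for_each_sg() walks straight across the link:

#include <linux/scatterlist.h>

extern void process(struct scatterlist *sg);	/* hypothetical */

static void walk_chained(void)
{
	struct scatterlist first[8], second[4];
	struct scatterlist *sg;
	int i;

	/* ... fill first[0..6] and second[0..3] with real pages ... */

	sg_chain(first, 8, second);	/* first[7] becomes the link entry */

	for_each_sg(first, sg, 7 + 4, i)	/* 11 payload entries total */
		process(sg);
}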
Just pass in the command; there's no point in passing in the scatterlist
and scatterlist pool index separately.
Signed-off-by: Jens Axboe <[email protected]>
---
drivers/scsi/scsi_lib.c | 9 +++++----
drivers/scsi/scsi_tgt_lib.c | 4 ++--
include/scsi/scsi_cmnd.h | 2 +-
3 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index f944690..26236b1 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -745,13 +745,14 @@ struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
EXPORT_SYMBOL(scsi_alloc_sgtable);
-void scsi_free_sgtable(struct scatterlist *sgl, int index)
+void scsi_free_sgtable(struct scsi_cmnd *cmd)
{
+ struct scatterlist *sgl = cmd->request_buffer;
struct scsi_host_sg_pool *sgp;
- BUG_ON(index >= SG_MEMPOOL_NR);
+ BUG_ON(cmd->sglist_len >= SG_MEMPOOL_NR);
- sgp = scsi_sg_pools + index;
+ sgp = scsi_sg_pools + cmd->sglist_len;
mempool_free(sgl, sgp->pool);
}
@@ -777,7 +778,7 @@ EXPORT_SYMBOL(scsi_free_sgtable);
static void scsi_release_buffers(struct scsi_cmnd *cmd)
{
if (cmd->use_sg)
- scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len);
+ scsi_free_sgtable(cmd);
/*
* Zero these out. They now point to freed memory, and it is
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 2570f48..d6e58e5 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -329,7 +329,7 @@ static void scsi_tgt_cmd_done(struct scsi_cmnd *cmd)
scsi_tgt_uspace_send_status(cmd, tcmd->tag);
if (cmd->request_buffer)
- scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len);
+ scsi_free_sgtable(cmd);
queue_work(scsi_tgtd, &tcmd->work);
}
@@ -370,7 +370,7 @@ static int scsi_tgt_init_cmd(struct scsi_cmnd *cmd, gfp_t gfp_mask)
}
eprintk("cmd %p cnt %d\n", cmd, cmd->use_sg);
- scsi_free_sgtable(cmd->request_buffer, cmd->sglist_len);
+ scsi_free_sgtable(cmd);
return -EINVAL;
}
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index a2e0c10..d7db992 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -133,6 +133,6 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
extern void scsi_kunmap_atomic_sg(void *virt);
extern struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *, gfp_t);
-extern void scsi_free_sgtable(struct scatterlist *, int);
+extern void scsi_free_sgtable(struct scsi_cmnd *);
#endif /* _SCSI_SCSI_CMND_H */
--
1.5.2.rc1
Expose this setting for now, so that users can play with enabling
large commands without turning it on globally by default. This is a
debug patch; it will be dropped before the final version.
Signed-off-by: Jens Axboe <[email protected]>
---
block/ll_rw_blk.c | 23 +++++++++++++++++++++++
1 files changed, 23 insertions(+), 0 deletions(-)
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index b01a5f2..18f1db0 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3930,7 +3930,23 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
return queue_var_show(max_hw_sectors_kb, (page));
}
+static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(q->max_phys_segments, page);
+}
+
+static ssize_t queue_max_segments_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ unsigned long segments;
+ ssize_t ret = queue_var_store(&segments, page, count);
+ spin_lock_irq(q->queue_lock);
+ q->max_phys_segments = segments;
+ spin_unlock_irq(q->queue_lock);
+
+ return ret;
+}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -3954,6 +3970,12 @@ static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
.show = queue_max_hw_sectors_show,
};
+static struct queue_sysfs_entry queue_max_segments_entry = {
+ .attr = {.name = "max_segments", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_max_segments_show,
+ .store = queue_max_segments_store,
+};
+
static struct queue_sysfs_entry queue_iosched_entry = {
.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
.show = elv_iosched_show,
@@ -3965,6 +3987,7 @@ static struct attribute *default_attrs[] = {
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
+ &queue_max_segments_entry.attr,
&queue_iosched_entry.attr,
NULL,
};
--
1.5.2.rc1
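With this applied, the knob should show up as max_segments in each
block device's queue sysfs directory, next to nr_requests and the
other attributes above, so the segment limit can be raised per device
for testing without changing the global default.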
This is what enables large commands. If we need to allocate an
sgtable that doesn't fit in a single page, allocate several
SCSI_MAX_SG_SEGMENTS-sized tables and chain them together.
We default to the safe setup of NOT chaining, for now.
Signed-off-by: Jens Axboe <[email protected]>
---
drivers/scsi/scsi_lib.c | 204 +++++++++++++++++++++++++++++++++++-----------
include/scsi/scsi.h | 7 --
include/scsi/scsi_cmnd.h | 1 +
3 files changed, 158 insertions(+), 54 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 26236b1..2b36c09 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -29,39 +29,26 @@
#include "scsi_priv.h"
#include "scsi_logging.h"
+#include <linux/scatterlist.h>
#define SG_MEMPOOL_NR ARRAY_SIZE(scsi_sg_pools)
#define SG_MEMPOOL_SIZE 2
struct scsi_host_sg_pool {
size_t size;
- char *name;
+ char *name;
struct kmem_cache *slab;
mempool_t *pool;
};
-#if (SCSI_MAX_PHYS_SEGMENTS < 32)
-#error SCSI_MAX_PHYS_SEGMENTS is too small
-#endif
-
-#define SP(x) { x, "sgpool-" #x }
+#define SP(x) { x, "sgpool-" #x }
static struct scsi_host_sg_pool scsi_sg_pools[] = {
SP(8),
SP(16),
SP(32),
-#if (SCSI_MAX_PHYS_SEGMENTS > 32)
SP(64),
-#if (SCSI_MAX_PHYS_SEGMENTS > 64)
SP(128),
-#if (SCSI_MAX_PHYS_SEGMENTS > 128)
- SP(256),
-#if (SCSI_MAX_PHYS_SEGMENTS > 256)
-#error SCSI_MAX_PHYS_SEGMENTS is too large
-#endif
-#endif
-#endif
-#endif
-};
+};
#undef SP
static void scsi_run_queue(struct request_queue *q);
@@ -702,45 +689,126 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int uptodate,
return NULL;
}
-struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
-{
- struct scsi_host_sg_pool *sgp;
- struct scatterlist *sgl;
+/*
+ * The maximum number of SG segments that we will put inside a scatterlist
+ * (unless chaining is used). Should ideally fit inside a single page, to
+ * avoid a higher order allocation.
+ */
+#define SCSI_MAX_SG_SEGMENTS 128
- BUG_ON(!cmd->use_sg);
+static inline unsigned int scsi_sgtable_index(unsigned short nents)
+{
+ unsigned int index;
- switch (cmd->use_sg) {
+ switch (nents) {
case 1 ... 8:
- cmd->sglist_len = 0;
+ index = 0;
break;
case 9 ... 16:
- cmd->sglist_len = 1;
+ index = 1;
break;
case 17 ... 32:
- cmd->sglist_len = 2;
+ index = 2;
break;
-#if (SCSI_MAX_PHYS_SEGMENTS > 32)
case 33 ... 64:
- cmd->sglist_len = 3;
+ index = 3;
break;
-#if (SCSI_MAX_PHYS_SEGMENTS > 64)
- case 65 ... 128:
- cmd->sglist_len = 4;
+ case 65 ... SCSI_MAX_SG_SEGMENTS:
+ index = 4;
break;
-#if (SCSI_MAX_PHYS_SEGMENTS > 128)
- case 129 ... 256:
- cmd->sglist_len = 5;
- break;
-#endif
-#endif
-#endif
default:
- return NULL;
+ printk(KERN_ERR "scsi: bad segment count=%d\n", nents);
+ BUG();
}
- sgp = scsi_sg_pools + cmd->sglist_len;
- sgl = mempool_alloc(sgp->pool, gfp_mask);
- return sgl;
+ return index;
+}
+
+struct scatterlist *scsi_alloc_sgtable(struct scsi_cmnd *cmd, gfp_t gfp_mask)
+{
+ struct scsi_host_sg_pool *sgp;
+ struct scatterlist *sgl, *prev, *ret;
+ unsigned int index;
+ int this, left;
+
+ BUG_ON(!cmd->use_sg);
+
+ left = cmd->use_sg;
+ ret = prev = NULL;
+ do {
+ this = left;
+ if (this > SCSI_MAX_SG_SEGMENTS) {
+ this = SCSI_MAX_SG_SEGMENTS;
+ index = SG_MEMPOOL_NR - 1;
+ } else
+ index = scsi_sgtable_index(this);
+
+ left -= this;
+
+ /*
+ * if we have more entries after this round, reserve a slot
+ * for the chain pointer.
+ */
+ if (left)
+ left++;
+
+ sgp = scsi_sg_pools + index;
+
+ sgl = mempool_alloc(sgp->pool, gfp_mask);
+ if (unlikely(!sgl))
+ goto enomem;
+
+ memset(sgl, 0, sizeof(*sgl) * sgp->size);
+
+ /*
+ * first loop through, set initial index and return value
+ */
+ if (!ret) {
+ cmd->sglist_len = index;
+ ret = sgl;
+ }
+
+ /*
+ * chain previous sglist, if any. we know the previous
+ * sglist must be the biggest one, or we would not have
+ * ended up doing another loop.
+ */
+ if (prev)
+ sg_chain(prev, SCSI_MAX_SG_SEGMENTS, sgl);
+
+ /*
+ * don't allow subsequent mempool allocs to sleep, it would
+ * violate the mempool principle.
+ */
+ gfp_mask &= ~__GFP_WAIT;
+ gfp_mask |= __GFP_HIGH;
+ prev = sgl;
+ } while (left);
+
+ /*
+ * ->use_sg may get modified after dma mapping has potentially
+ * shrunk the number of segments, so keep a copy of it for free.
+ */
+ cmd->__use_sg = cmd->use_sg;
+ return ret;
+enomem:
+ if (ret) {
+ /*
+ * Free entries chained off ret. Since we were trying to
+ * allocate another sglist, we know that all entries are of
+ * the max size.
+ */
+ sgp = scsi_sg_pools + SG_MEMPOOL_NR - 1;
+ prev = &ret[SCSI_MAX_SG_SEGMENTS - 1];
+
+ while ((sgl = sg_chain_ptr(ret)) != NULL) {
+ ret = &sgl[SCSI_MAX_SG_SEGMENTS - 1];
+ mempool_free(sgl, sgp->pool);
+ }
+
+ mempool_free(prev, sgp->pool);
+ }
+ return NULL;
}
EXPORT_SYMBOL(scsi_alloc_sgtable);
@@ -752,6 +820,42 @@ void scsi_free_sgtable(struct scsi_cmnd *cmd)
BUG_ON(cmd->sglist_len >= SG_MEMPOOL_NR);
+ /*
+ * if this is the biggest size sglist, check if we have
+ * chained parts we need to free
+ */
+ if (cmd->__use_sg > SCSI_MAX_SG_SEGMENTS) {
+ unsigned short this, left;
+ struct scatterlist *next;
+ unsigned int index;
+
+ left = cmd->__use_sg - SCSI_MAX_SG_SEGMENTS;
+ next = sg_chain_ptr(&sgl[SCSI_MAX_SG_SEGMENTS - 1]);
+ do {
+ sgl = next;
+ this = left;
+ if (this > SCSI_MAX_SG_SEGMENTS) {
+ this = SCSI_MAX_SG_SEGMENTS;
+ index = SG_MEMPOOL_NR - 1;
+ } else
+ index = scsi_sgtable_index(this);
+
+ left -= this;
+
+ sgp = scsi_sg_pools + index;
+
+ if (left)
+ next = sg_chain_ptr(&sgl[sgp->size - 1]);
+
+ mempool_free(sgl, sgp->pool);
+ } while (left);
+
+ /*
+ * Restore original, will be freed below
+ */
+ sgl = cmd->request_buffer;
+ }
+
sgp = scsi_sg_pools + cmd->sglist_len;
mempool_free(sgl, sgp->pool);
}
@@ -993,7 +1097,6 @@ EXPORT_SYMBOL(scsi_io_completion);
static int scsi_init_io(struct scsi_cmnd *cmd)
{
struct request *req = cmd->request;
- struct scatterlist *sgpnt;
int count;
/*
@@ -1006,14 +1109,13 @@ static int scsi_init_io(struct scsi_cmnd *cmd)
/*
* If sg table allocation fails, requeue request later.
*/
- sgpnt = scsi_alloc_sgtable(cmd, GFP_ATOMIC);
- if (unlikely(!sgpnt)) {
+ cmd->request_buffer = scsi_alloc_sgtable(cmd, GFP_ATOMIC);
+ if (unlikely(!cmd->request_buffer)) {
scsi_unprep_request(req);
return BLKPREP_DEFER;
}
req->buffer = NULL;
- cmd->request_buffer = (char *) sgpnt;
if (blk_pc_request(req))
cmd->request_bufflen = req->data_len;
else
@@ -1577,8 +1679,16 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
if (!q)
return NULL;
+ /*
+ * this limit is imposed by hardware restrictions
+ */
blk_queue_max_hw_segments(q, shost->sg_tablesize);
- blk_queue_max_phys_segments(q, SCSI_MAX_PHYS_SEGMENTS);
+
+ /*
+ * we can chain scatterlists, so this limit is fairly arbitrary
+ */
+ blk_queue_max_phys_segments(q, SCSI_MAX_SG_SEGMENTS);
+
blk_queue_max_sectors(q, shost->max_sectors);
blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
blk_queue_segment_boundary(q, shost->dma_boundary);
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 9f8f80a..702fcfe 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -11,13 +11,6 @@
#include <linux/types.h>
/*
- * The maximum sg list length SCSI can cope with
- * (currently must be a power of 2 between 32 and 256)
- */
-#define SCSI_MAX_PHYS_SEGMENTS MAX_PHYS_SEGMENTS
-
-
-/*
* SCSI command lengths
*/
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index d7db992..fc649af 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -72,6 +72,7 @@ struct scsi_cmnd {
/* These elements define the operation we ultimately want to perform */
unsigned short use_sg; /* Number of pieces of scatter-gather */
unsigned short sglist_len; /* size of malloc'd scatter-gather list */
+ unsigned short __use_sg;
unsigned underflow; /* Return error if less than
this amount is transferred */
--
1.5.2.rc1
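As a worked example of the allocation loop above, here is a standalone
userspace sketch (not kernel code; the 200-segment command is
hypothetical):

#include <stdio.h>

#define SCSI_MAX_SG_SEGMENTS	128

int main(void)
{
	int left = 200;	/* stand-in for cmd->use_sg */

	while (left) {
		int this = left;

		if (this > SCSI_MAX_SG_SEGMENTS)
			this = SCSI_MAX_SG_SEGMENTS;

		left -= this;
		if (left)	/* one slot is eaten by the chain entry */
			left++;

		printf("allocate table for %d entries, %d left\n", this, left);
	}

	return 0;
}

For 200 segments this does two rounds: a full sgpool-128 table (127
payload entries plus the chain link), then a second table for the
remaining 73 entries, which scsi_sgtable_index() also maps to the
128 pool.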
This prepares x86-64 for sg chaining support.
Signed-off-by: Jens Axboe <[email protected]>
---
arch/x86_64/kernel/pci-calgary.c | 25 ++++++++++++---------
arch/x86_64/kernel/pci-gart.c | 44 ++++++++++++++++++++++---------------
arch/x86_64/kernel/pci-nommu.c | 5 ++-
3 files changed, 43 insertions(+), 31 deletions(-)
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
index 5bd20b5..c472b14 100644
--- a/arch/x86_64/kernel/pci-calgary.c
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -35,6 +35,7 @@
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/delay.h>
+#include <linux/scatterlist.h>
#include <asm/proto.h>
#include <asm/calgary.h>
#include <asm/tce.h>
@@ -341,17 +342,19 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
static void __calgary_unmap_sg(struct iommu_table *tbl,
struct scatterlist *sglist, int nelems, int direction)
{
- while (nelems--) {
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sglist, s, nelems, i) {
unsigned int npages;
- dma_addr_t dma = sglist->dma_address;
- unsigned int dmalen = sglist->dma_length;
+ dma_addr_t dma = s->dma_address;
+ unsigned int dmalen = s->dma_length;
if (dmalen == 0)
break;
npages = num_dma_pages(dma, dmalen);
__iommu_free(tbl, dma, npages);
- sglist++;
}
}
@@ -374,10 +377,10 @@ void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
static int calgary_nontranslate_map_sg(struct device* dev,
struct scatterlist *sg, int nelems, int direction)
{
+ struct scatterlist *s;
int i;
- for (i = 0; i < nelems; i++ ) {
- struct scatterlist *s = &sg[i];
+ for_each_sg(sg, s, nelems, i) {
BUG_ON(!s->page);
s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
s->dma_length = s->length;
@@ -389,6 +392,7 @@ int calgary_map_sg(struct device *dev, struct scatterlist *sg,
int nelems, int direction)
{
struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+ struct scatterlist *s;
unsigned long flags;
unsigned long vaddr;
unsigned int npages;
@@ -400,8 +404,7 @@ int calgary_map_sg(struct device *dev, struct scatterlist *sg,
spin_lock_irqsave(&tbl->it_lock, flags);
- for (i = 0; i < nelems; i++ ) {
- struct scatterlist *s = &sg[i];
+ for_each_sg(sg, s, nelems, i) {
BUG_ON(!s->page);
vaddr = (unsigned long)page_address(s->page) + s->offset;
@@ -428,9 +431,9 @@ int calgary_map_sg(struct device *dev, struct scatterlist *sg,
return nelems;
error:
__calgary_unmap_sg(tbl, sg, nelems, direction);
- for (i = 0; i < nelems; i++) {
- sg[i].dma_address = bad_dma_address;
- sg[i].dma_length = 0;
+ for_each_sg(sg, s, nelems, i) {
+ s->dma_address = bad_dma_address;
+ s->dma_length = 0;
}
spin_unlock_irqrestore(&tbl->it_lock, flags);
return 0;
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 373ef66..8bc2ed7 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -23,6 +23,7 @@
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/kdebug.h>
+#include <linux/scatterlist.h>
#include <asm/atomic.h>
#include <asm/io.h>
#include <asm/mtrr.h>
@@ -277,10 +278,10 @@ void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
*/
void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
{
+ struct scatterlist *s;
int i;
- for (i = 0; i < nents; i++) {
- struct scatterlist *s = &sg[i];
+ for_each_sg(sg, s, nents, i) {
if (!s->dma_length || !s->length)
break;
gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
@@ -291,14 +292,14 @@ void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int di
static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
int nents, int dir)
{
+ struct scatterlist *s;
int i;
#ifdef CONFIG_IOMMU_DEBUG
printk(KERN_DEBUG "dma_map_sg overflow\n");
#endif
- for (i = 0; i < nents; i++ ) {
- struct scatterlist *s = &sg[i];
+ for_each_sg(sg, s, nents, i) {
unsigned long addr = page_to_phys(s->page) + s->offset;
if (nonforced_iommu(dev, addr, s->length)) {
addr = dma_map_area(dev, addr, s->length, dir);
@@ -323,13 +324,17 @@ static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
{
unsigned long iommu_start = alloc_iommu(pages);
unsigned long iommu_page = iommu_start;
- int i;
+ struct scatterlist *s;
+ int i, nelems;
if (iommu_start == -1)
return -1;
+
+ nelems = stopat - start;
+ while (start--)
+ sg = sg_next(sg);
- for (i = start; i < stopat; i++) {
- struct scatterlist *s = &sg[i];
+ for_each_sg(sg, s, nelems, i) {
unsigned long pages, addr;
unsigned long phys_addr = s->dma_address;
@@ -360,12 +365,14 @@ static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
struct scatterlist *sout,
unsigned long pages, int need)
{
- if (!need) {
+ if (!need) {
BUG_ON(stopat - start != 1);
- *sout = sg[start];
- sout->dma_length = sg[start].length;
+ while (--start)
+ sg = sg_next(sg);
+ *sout = *sg;
+ sout->dma_length = sg->length;
return 0;
- }
+ }
return __dma_map_cont(sg, start, stopat, sout, pages);
}
@@ -380,6 +387,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
int start;
unsigned long pages = 0;
int need = 0, nextneed;
+ struct scatterlist *s, *ps;
if (nents == 0)
return 0;
@@ -389,8 +397,8 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
out = 0;
start = 0;
- for (i = 0; i < nents; i++) {
- struct scatterlist *s = &sg[i];
+ ps = NULL; /* shut up gcc */
+ for_each_sg(sg, s, nents, i) {
dma_addr_t addr = page_to_phys(s->page) + s->offset;
s->dma_address = addr;
BUG_ON(s->length == 0);
@@ -399,7 +407,6 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
/* Handle the previous not yet processed entries */
if (i > start) {
- struct scatterlist *ps = &sg[i-1];
/* Can only merge when the last chunk ends on a page
boundary and the new one doesn't have an offset. */
if (!iommu_merge || !nextneed || !need || s->offset ||
@@ -415,13 +422,14 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
need = nextneed;
pages += to_pages(s->offset, s->length);
+ ps = s;
}
if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
goto error;
out++;
flush_gart();
- if (out < nents)
- sg[out].dma_length = 0;
+ if (out < nents)
+ ps->dma_length = 0;
return out;
error:
@@ -436,8 +444,8 @@ error:
if (panic_on_overflow)
panic("dma_map_sg: overflow on %lu pages\n", pages);
iommu_full(dev, pages << PAGE_SHIFT, dir);
- for (i = 0; i < nents; i++)
- sg[i].dma_address = bad_dma_address;
+ for_each_sg(sg, s, nents, i)
+ s->dma_address = bad_dma_address;
return 0;
}
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 6dade0c..24c9faf 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -5,6 +5,7 @@
#include <linux/pci.h>
#include <linux/string.h>
#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
#include <asm/proto.h>
#include <asm/processor.h>
@@ -57,10 +58,10 @@ void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
int nents, int direction)
{
+ struct scatterlist *s;
int i;
- for (i = 0; i < nents; i++ ) {
- struct scatterlist *s = &sg[i];
+ for_each_sg(sg, s, nents, i) {
BUG_ON(!s->page);
s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
--
1.5.2.rc1
This converts the SCSI mid layer to using the sg helpers for looking up
sg elements, instead of doing it manually.
Signed-off-by: Jens Axboe <[email protected]>
---
drivers/scsi/scsi_lib.c | 20 +++++++++++---------
1 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1f5a07b..f944690 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -302,14 +302,15 @@ static int scsi_req_map_sg(struct request *rq, struct scatterlist *sgl,
struct request_queue *q = rq->q;
int nr_pages = (bufflen + sgl[0].offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned int data_len = 0, len, bytes, off;
+ struct scatterlist *sg;
struct page *page;
struct bio *bio = NULL;
int i, err, nr_vecs = 0;
- for (i = 0; i < nsegs; i++) {
- page = sgl[i].page;
- off = sgl[i].offset;
- len = sgl[i].length;
+ for_each_sg(sgl, sg, nsegs, i) {
+ page = sg->page;
+ off = sg->offset;
+ len = sg->length;
data_len += len;
while (len > 0) {
@@ -2240,18 +2241,19 @@ EXPORT_SYMBOL_GPL(scsi_target_unblock);
*
* Returns virtual address of the start of the mapped page
*/
-void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
+void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count,
size_t *offset, size_t *len)
{
int i;
size_t sg_len = 0, len_complete = 0;
+ struct scatterlist *sg;
struct page *page;
WARN_ON(!irqs_disabled());
- for (i = 0; i < sg_count; i++) {
+ for_each_sg(sgl, sg, sg_count, i) {
len_complete = sg_len; /* Complete sg-entries */
- sg_len += sg[i].length;
+ sg_len += sg->length;
if (sg_len > *offset)
break;
}
@@ -2265,10 +2267,10 @@ void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
}
/* Offset starting from the beginning of first page in this sg-entry */
- *offset = *offset - len_complete + sg[i].offset;
+ *offset = *offset - len_complete + sg->offset;
/* Assumption: contiguous pages can be accessed as "page + i" */
- page = nth_page(sg[i].page, (*offset >> PAGE_SHIFT));
+ page = nth_page(sg->page, (*offset >> PAGE_SHIFT));
*offset &= ~PAGE_MASK;
/* Bytes in this sg-entry from *offset to the end of the page */
--
1.5.2.rc1
The dma mapping helpers need to be converted to the sg helpers as
well, so they will work with a chained sglist setup.
Signed-off-by: Jens Axboe <[email protected]>
---
include/asm-i386/dma-mapping.h | 13 +++++++------
1 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/include/asm-i386/dma-mapping.h b/include/asm-i386/dma-mapping.h
index 183eebe..a956ec1 100644
--- a/include/asm-i386/dma-mapping.h
+++ b/include/asm-i386/dma-mapping.h
@@ -2,10 +2,10 @@
#define _ASM_I386_DMA_MAPPING_H
#include <linux/mm.h>
+#include <linux/scatterlist.h>
#include <asm/cache.h>
#include <asm/io.h>
-#include <asm/scatterlist.h>
#include <asm/bug.h>
#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -35,18 +35,19 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
}
static inline int
-dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
enum dma_data_direction direction)
{
+ struct scatterlist *sg;
int i;
BUG_ON(!valid_dma_direction(direction));
- WARN_ON(nents == 0 || sg[0].length == 0);
+ WARN_ON(nents == 0 || sglist[0].length == 0);
- for (i = 0; i < nents; i++ ) {
- BUG_ON(!sg[i].page);
+ for_each_sg(sglist, sg, nents, i) {
+ BUG_ON(!sg->page);
- sg[i].dma_address = page_to_phys(sg[i].page) + sg[i].offset;
+ sg->dma_address = page_to_phys(sg->page) + sg->offset;
}
flush_write_buffers();
--
1.5.2.rc1
Hi Jens,
On 5/10/07, Jens Axboe <[email protected]> wrote:
> The core of the patch - allow the last sg element in a scatterlist
> table to point to the start of a new table. We overload the LSB of
> the page pointer to indicate whether this is a valid sg entry, or
> merely a link to the next list.
>
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> include/asm-i386/scatterlist.h | 2 +
> include/linux/scatterlist.h | 52 ++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 52 insertions(+), 2 deletions(-)
>
> diff --git a/include/asm-i386/scatterlist.h b/include/asm-i386/scatterlist.h
> index d7e45a8..bd5164a 100644
> --- a/include/asm-i386/scatterlist.h
> +++ b/include/asm-i386/scatterlist.h
> @@ -10,6 +10,8 @@ struct scatterlist {
> unsigned int length;
> };
>
> +#define ARCH_HAS_SG_CHAIN
> +
> /* These macros should be used after a pci_map_sg call has been done
> * to get bus addresses of each of the SG entries and their lengths.
> * You should only work with the number of sg entries pci_map_sg
> diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
> index bed5ab4..fa2dc1c 100644
> --- a/include/linux/scatterlist.h
> +++ b/include/linux/scatterlist.h
> [...]
> +/*
> + * We could improve this by passing in the maximum size of an sglist, so
> + * we could jump directly to the last table. That would eliminate this
> + * (potentially) lengthy scan.
> + */
> +static inline struct scatterlist *sg_last(struct scatterlist *sgl,
> + unsigned int nents)
> +{
> +#ifdef ARCH_HAS_SG_CHAIN
> + struct scatterlist *ret = &sgl[nents - 1];
> +#else
> + struct scatterlist *sg, *ret = NULL;
> + int i;
> +
> + for_each_sg(sgl, sg, nents, i)
> + ret = sg;
> +
> +#endif
> + return ret;
> +}
> +
> +/*
> + * Chain previous sglist to this one
> + */
> +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
> + struct scatterlist *sgl)
> +{
> +#ifndef ARCH_HAS_SG_CHAIN
> + BUG();
> +#endif
> + prv[prv_nents - 1].page = (struct page *) ((unsigned long) sgl | 0x01);
> +}
> +
> #endif /* _LINUX_SCATTERLIST_H */
(Style triviality)
In include/linux/scatterlist.h, why not simply:
#ifdef ARCH_HAS_SG_CHAIN
> +/*
> + * We could improve this by passing in the maximum size of an sglist, so
> + * we could jump directly to the last table. That would eliminate this
> + * (potentially) lengthy scan.
> + */
> +static inline struct scatterlist *sg_last(struct scatterlist *sgl,
> + unsigned int nents)
> +{
> + struct scatterlist *ret = &sgl[nents - 1];
> + return ret;
> +}
> +
> +/*
> + * Chain previous sglist to this one
> + */
> +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
> + struct scatterlist *sgl)
> +{
> + prv[prv_nents - 1].page = (struct page *) ((unsigned long) sgl | 0x01);
> +}
#else
> +/*
> + * We could improve this by passing in the maximum size of an sglist, so
> + * we could jump directly to the last table. That would eliminate this
> + * (potentially) lengthy scan.
> + */
> +static inline struct scatterlist *sg_last(struct scatterlist *sgl,
> + unsigned int nents)
> +{
> + struct scatterlist *sg, *ret = NULL;
> + int i;
> +
> + for_each_sg(sgl, sg, nents, i)
> + ret = sg;
> +
> + return ret;
> +}
> +
> +/*
> + * Chain previous sglist to this one
> + */
> +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
> + struct scatterlist *sgl)
> +{
> + BUG();
> +}
#endif /* ARCH_HAS_SG_CHAIN */
Similar to most of the other headers I've seen, and avoids multiple
uses of #ifdef / #ifndef.
On 5/10/07, Jens Axboe <[email protected]> wrote:
> [...]
> +static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
> + struct scatterlist *sgl)
> +{
> +#ifndef ARCH_HAS_SG_CHAIN
> + BUG();
> +#endif
Hmmm ... so we better also make sure all users call this from within
an "#ifdef ARCH_HAS_SG_CHAIN" themselves (or else make this a config
option defined in the arch/.../defconfig of archs that support this
and then make the calling code's Kconfig option "depends on ... &&
ARCH_FOO").
(Just saw a previous thread; Andrew seems to have beaten me to this observation.)
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 8bc2ed7..48ce635 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -319,27 +319,23 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
}
/* Map multiple scatterlist entries continuous into the first. */
-static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
+static int __dma_map_cont(struct scatterlist *start, int nelems,
struct scatterlist *sout, unsigned long pages)
{
unsigned long iommu_start = alloc_iommu(pages);
unsigned long iommu_page = iommu_start;
struct scatterlist *s;
- int i, nelems;
+ int i;
if (iommu_start == -1)
return -1;
- nelems = stopat - start;
- while (start--)
- sg = sg_next(sg);
-
- for_each_sg(sg, s, nelems, i) {
+ for_each_sg(start, s, nelems, i) {
unsigned long pages, addr;
unsigned long phys_addr = s->dma_address;
- BUG_ON(i > start && s->offset);
- if (i == start) {
+ BUG_ON(s != start && s->offset);
+ if (s == start) {
*sout = *s;
sout->dma_address = iommu_bus_base;
sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
@@ -361,19 +357,17 @@ static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
return 0;
}
-static inline int dma_map_cont(struct scatterlist *sg, int start, int stopat,
+static inline int dma_map_cont(struct scatterlist *start, int nelems,
struct scatterlist *sout,
unsigned long pages, int need)
{
if (!need) {
- BUG_ON(stopat - start != 1);
- while (--start)
- sg = sg_next(sg);
- *sout = *sg;
- sout->dma_length = sg->length;
+ BUG_ON(nelems != 1);
+ *sout = *start;
+ sout->dma_length = start->length;
return 0;
}
- return __dma_map_cont(sg, start, stopat, sout, pages);
+ return __dma_map_cont(start, nelems, sout, pages);
}
/*
@@ -387,7 +381,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
int start;
unsigned long pages = 0;
int need = 0, nextneed;
- struct scatterlist *s, *ps;
+ struct scatterlist *s, *ps, *start_sg;
if (nents == 0)
return 0;
@@ -397,6 +391,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
out = 0;
start = 0;
+ start_sg = sg;
ps = NULL; /* shut up gcc */
for_each_sg(sg, s, nents, i) {
dma_addr_t addr = page_to_phys(s->page) + s->offset;
@@ -411,12 +406,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
boundary and the new one doesn't have an offset. */
if (!iommu_merge || !nextneed || !need || s->offset ||
(ps->offset + ps->length) % PAGE_SIZE) {
- if (dma_map_cont(sg, start, i, sg+out, pages,
- need) < 0)
+ if (dma_map_cont(start_sg, i - start, sg+out,
+ pages, need) < 0)
goto error;
out++;
pages = 0;
- start = i;
+ start = i;
+ start_sg = s;
}
}
@@ -424,7 +420,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
pages += to_pages(s->offset, s->length);
ps = s;
}
- if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
+ if (dma_map_cont(start_sg, i - start, sg+out, pages, need) < 0)
goto error;
out++;
flush_gart();
Jens Axboe wrote:
> It's a subsystem function, prefix it as such.
Jens, Boaz and I talked about this over lunch.
I wonder whether the crypto code must use your implementation
instead of its own, as it needs to iterate over the sglist, e.g. for
calculating the iSCSI (data) digest.
The crypto implementation of chained sglists in crypto/scatterwalk.h
determines the chain link by !sg->length, which will sorta work
with your implementation; however, the marker bit on the page pointer
must be cleared to use it.
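For comparison, the two lookup steps side by side (the function names
are mine, for illustration only):

/* crypto style: a zero-length entry is the link, its ->page holds
 * the next table (see scatterwalk_sg_next() earlier in this thread) */
static inline struct scatterlist *length_style_next(struct scatterlist *sg)
{
	return (++sg)->length ? sg : (void *) sg->page;
}

/* your style: the LSB of ->page marks the link, so ->length stays
 * meaningful on every entry */
static inline struct scatterlist *tagged_style_next(struct scatterlist *sg)
{
	sg++;
	if (sg_is_chain(sg))
		sg = sg_chain_ptr(sg);
	return sg;
}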
Also, is it possible that after the original sglist has gone through
dma_map_sg and entries were merged, some entries will have zero
length? I'm not sure... If so, if the crypto implementation scans
the sg list after it was dma mapped (maybe in a retry path) it
may hit an entry that looks to it like a chaining link. This
might be an existing bug and another reason for the crypto code
to use your implementation.
Thanks,
Benny
On Thu, May 10 2007, Benny Halevy wrote:
> Jens Axboe wrote:
> > It's a subsystem function, prefix it as such.
>
> Jens, Boaz and I talked about this over lunch.
> I wonder whether the crypto code must use your implementation
> instead of its own, as it needs to iterate over the sglist, e.g. for
> calculating the iSCSI (data) digest.
The thought did cross my mind, and yes I think that would be a good
idea. The whole thing should probably just migrate to
lib/scattersomething.c
> The crypto implementation of chained sglists in crypto/scatterwalk.h
> determines the chain link by !sg->length, which will sorta work
> with your implementation; however, the marker bit on the page pointer
> must be cleared to use it.
I don't like using sg->length, as that may be modified for legitimate
reasons. That's why I chose to use the LSB of the page pointer.
> Also, is it possible that after the original sglist has gone through
> dma_map_sg and entries were merged, some entries will have zero
> length? I'm not sure... If so, if the crypto implementation scans
> the sg list after it was dma mapped (maybe in a retry path) it
> may hit an entry that looks to it like a chaining link. This
> might be an existing bug and another reason for the crypto code
> to use your implementation.
It's hard to say; it depends heavily on the subsystem or arch. Even if
the pointer tagging mechanism seems a bit nasty, I think it's the
more resilient approach.
--
Jens Axboe
On Thu, May 10 2007, Benny Halevy wrote:
> Jens Axboe wrote:
> <snip>
> > @@ -323,13 +324,17 @@ static int __dma_map_cont(struct scatterlist *sg, int start, int stopat,
> > {
> > unsigned long iommu_start = alloc_iommu(pages);
> > unsigned long iommu_page = iommu_start;
> > - int i;
> > + struct scatterlist *s;
> > + int i, nelems;
> >
> > if (iommu_start == -1)
> > return -1;
> > +
> > + nelems = stopat - start;
> > + while (start--)
> > + sg = sg_next(sg);
>
> Ouch. This will suck if we merge many times in a long list.
> How about keeping and passing start as a struct scatterlist * rather
> than an index? (see attached example; compiles, but untested)
It will suck indeed, I like your patch! I'll review and incorporate.
Your other points are good, noted.
--
Jens Axboe
Jens Axboe wrote:
> On Thu, May 10 2007, Benny Halevy wrote:
>> Jens Axboe wrote:
>>> It's a subsystem function, prefix it as such.
>> Jens, Boaz and I talked about this over lunch.
>> I wonder whether the crypto code must use your implementation
>> instead of its own, as it needs to iterate over the sglist, e.g. for
>> calculating the iSCSI (data) digest.
>
> The thought did cross my mind, and yes I think that would be a good
> idea. The whole thing should probably just migrate to
> lib/scattersomething.c
>
>> The crypto implementation of chained sglists in crypto/scatterwalk.h
>> determines the chain link by !sg->length, which will sorta work
>> with your implementation; however, the marker bit on the page pointer
>> must be cleared to use it.
>
> I don't like using sg->length, as that may be modified for legitimate
> reasons. That's why I chose to use the LSB of the page pointer.
>
>> Also, is it possible that after the original sglist has gone through
>> dma_map_sg and entries were merged, some entries will have zero
>> length? I'm not sure... If so, if the crypto implementation scans
>> the sg list after it was dma mapped (maybe in a retry path) it
>> may hit an entry that looks to it like a chaining link. This
>> might be an existing bug and another reason for the crypto code
>> to use your implementation.
>
> It's hard to say; it depends heavily on the subsystem or arch. Even if
> the pointer tagging mechanism seems a bit nasty, I think it's the
> more resilient approach.
>
We're in agreement then :)
I was trying to say that the methods should be compatible, otherwise
bugs can happen, and that your scheme is better since it can
handle sglists with zero-length entries that aren't the last, a case
that might be valid after dma mapping and merging.
If indeed this case is possible, this seems to be the right time
to converge to your scheme.
Benny
Benny Halevy <[email protected]> wrote:
>
> I was trying to say that the methods should be compatible, otherwise
> bugs can happen, and that your scheme is better since it can
> handle sglists with zero-length entries that aren't the last, a case
> that might be valid after dma mapping and merging.
> If indeed this case is possible, this seems to be the right time
> to converge to your scheme.
Well right now this isn't possible because the crypto layer is not
directly hooked to any DMA code since it's (mostly) software only.
However, I completely agree that it should be converted to this new
scheme. The only user of chaining right now is crypto/hmac.c so it
should be easy to fix.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[email protected]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
On Thu, May 10 2007, Benny Halevy wrote:
> @@ -411,12 +406,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
> boundary and the new one doesn't have an offset. */
> if (!iommu_merge || !nextneed || !need || s->offset ||
> (ps->offset + ps->length) % PAGE_SIZE) {
> - if (dma_map_cont(sg, start, i, sg+out, pages,
> - need) < 0)
> + if (dma_map_cont(start_sg, i - start, sg+out,
> + pages, need) < 0)
> goto error;
> out++;
> pages = 0;
> - start = i;
> + start = i;
> + start_sg = s;
> }
> }
>
> @@ -424,7 +420,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
> pages += to_pages(s->offset, s->length);
> ps = s;
> }
> - if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
> + if (dma_map_cont(start_sg, i - start, sg+out, pages, need) < 0)
> goto error;
> out++;
> flush_gart();
Your patch is (very) buggy; the whole premise of doing chained sg
entries makes sg + int illegal!
--
Jens Axboe
Jens Axboe wrote:
> On Thu, May 10 2007, Benny Halevy wrote:
>> @@ -411,12 +406,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
>> boundary and the new one doesn't have an offset. */
>> if (!iommu_merge || !nextneed || !need || s->offset ||
>> (ps->offset + ps->length) % PAGE_SIZE) {
>> - if (dma_map_cont(sg, start, i, sg+out, pages,
>> - need) < 0)
>> + if (dma_map_cont(start_sg, i - start, sg+out,
>> + pages, need) < 0)
>> goto error;
>> out++;
>> pages = 0;
>> - start = i;
>> + start = i;
>> + start_sg = s;
>> }
>> }
>>
>> @@ -424,7 +420,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
>> pages += to_pages(s->offset, s->length);
>> ps = s;
>> }
>> - if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
>> + if (dma_map_cont(start_sg, i - start, sg+out, pages, need) < 0)
>> goto error;
>> out++;
>> flush_gart();
>
> Your patch is (very) buggy; the whole premise of doing chained sg
> entries makes sg + int illegal!
>
You're right. I'll send a fix asap.
Benny
--
Benny Halevy
Software Architect
Tel/Fax: +972-3-647-8340
Mobile: +972-54-802-8340
[email protected]
Panasas, Inc.
The Leader in Parallel Storage
http://www.panasas.com
On Mon, May 21 2007, Benny Halevy wrote:
> Jens Axboe wrote:
> > On Thu, May 10 2007, Benny Halevy wrote:
> >> @@ -411,12 +406,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
> >> boundary and the new one doesn't have an offset. */
> >> if (!iommu_merge || !nextneed || !need || s->offset ||
> >> (ps->offset + ps->length) % PAGE_SIZE) {
> >> - if (dma_map_cont(sg, start, i, sg+out, pages,
> >> - need) < 0)
> >> + if (dma_map_cont(start_sg, i - start, sg+out,
> >> + pages, need) < 0)
> >> goto error;
> >> out++;
> >> pages = 0;
> >> - start = i;
> >> + start = i;
> >> + start_sg = s;
> >> }
> >> }
> >>
> >> @@ -424,7 +420,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
> >> pages += to_pages(s->offset, s->length);
> >> ps = s;
> >> }
> >> - if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
> >> + if (dma_map_cont(start_sg, i - start, sg+out, pages, need) < 0)
> >> goto error;
> >> out++;
> >> flush_gart();
> >
> > Your patch is (very) buggy; the whole premise of doing chained sg
> > entries makes sg + int illegal!
> >
>
> You're right. I'll send a fix asap.
Did you see this one? I cooked it up after sending that mail out. Please
review :-)
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 2e22a3a..b16384f 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -381,7 +381,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
int start;
unsigned long pages = 0;
int need = 0, nextneed;
- struct scatterlist *s, *ps, *start_sg;
+ struct scatterlist *s, *ps, *start_sg, *sgmap;
if (nents == 0)
return 0;
@@ -391,7 +391,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
out = 0;
start = 0;
- start_sg = sg;
+ start_sg = sgmap = sg;
ps = NULL; /* shut up gcc */
for_each_sg(sg, s, nents, i) {
dma_addr_t addr = page_to_phys(s->page) + s->offset;
@@ -405,11 +405,12 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
/* Can only merge when the last chunk ends on a page
boundary and the new one doesn't have an offset. */
if (!iommu_merge || !nextneed || !need || s->offset ||
- (ps->offset + ps->length) % PAGE_SIZE) {
- if (dma_map_cont(start_sg, i - start, sg+out,
+ (ps->offset + ps->length) % PAGE_SIZE) {
+ if (dma_map_cont(start_sg, i - start, sgmap,
pages, need) < 0)
goto error;
out++;
+ sgmap = sg_next(sgmap);
pages = 0;
start = i;
start_sg = s;
@@ -420,7 +421,7 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
pages += to_pages(s->offset, s->length);
ps = s;
}
- if (dma_map_cont(start_sg, i - start, sg+out, pages, need) < 0)
+ if (dma_map_cont(start_sg, i - start, sgmap, pages, need) < 0)
goto error;
out++;
flush_gart();
--
Jens Axboe