2009-03-02 06:33:26

by FUJITA Tomonori

Subject: Re: [PATCH] hpsa: SCSI driver for HP Smart Array controllers

On Fri, 27 Feb 2009 17:09:27 -0600
Mike Miller <[email protected]> wrote:

> This patch is a scsi based driver for the HP Smart Array controllers. It
> will eventually replace the block driver called cciss. At this time there is

Superb! This is what I've been waiting for.


> +/*define how many times we will try a command because of bus resets */
> +#define MAX_CMD_RETRIES 3
> +#define MAX_CTLR 32
> +
> +/* Embedded module documentation macros - see modules.h */
> +MODULE_AUTHOR("Hewlett-Packard Company");
> +MODULE_DESCRIPTION("Driver for HP Smart Array Controller version 1.0.0");
> +MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
> +MODULE_VERSION("1.0.0");
> +MODULE_LICENSE("GPL");
> +
> +static int allow_unknown_smartarray;
> +module_param(allow_unknown_smartarray, int, S_IRUGO|S_IWUSR);
> +MODULE_PARM_DESC(allow_unknown_smartarray,
> + "Allow driver to load on unknown HP Smart Array hardware");
> +
> +/* define the PCI info for the cards we can control */
> +static const struct pci_device_id hpsa_pci_device_id[] = {
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3223},
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3234},
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x323D},
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3241},
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3243},
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3245},
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3247},
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249},
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324a},
> + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324b},
> + {PCI_VENDOR_ID_HP, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
> + PCI_CLASS_STORAGE_RAID << 8, 0xffff << 8, 0},
> + {0,}
> +};
> +
> +MODULE_DEVICE_TABLE(pci, hpsa_pci_device_id);
> +
> +/* board_id = Subsystem Device ID & Vendor ID
> + * product = Marketing Name for the board
> + * access = Address of the struct of function pointers
> + */
> +static struct board_type products[] = {
> + {0x3223103C, "Smart Array P800", &SA5_access},
> + {0x3234103C, "Smart Array P400", &SA5_access},
> + {0x323d103c, "Smart Array P700M", &SA5_access},
> + {0x3241103C, "Smart Array P212", &SA5_access},
> + {0x3243103C, "Smart Array P410", &SA5_access},
> + {0x3245103C, "Smart Array P410i", &SA5_access},
> + {0x3247103C, "Smart Array P411", &SA5_access},
> + {0x3249103C, "Smart Array P812", &SA5_access},
> + {0x324a103C, "Smart Array P712m", &SA5_access},
> + {0x324b103C, "Smart Array P711m", &SA5_access},
> + {0xFFFF103C, "Unknown Smart Array", &SA5_access},
> +};
> +
> +static struct ctlr_info *hba[MAX_CTLR];

Do we really need this static array? Wouldn't allocating struct
ctlr_info dynamically be fine?
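
For illustration, the usual pattern would be to let the midlayer carry it
(a sketch; the probe function and the pdev field are hypothetical names,
not the submitted code):

static int __devinit hpsa_pci_probe(struct pci_dev *pdev,
				const struct pci_device_id *ent)
{
	struct Scsi_Host *sh;
	struct ctlr_info *h;

	sh = scsi_host_alloc(&hpsa_driver_template, sizeof(*h));
	if (!sh)
		return -ENOMEM;
	h = shost_priv(sh);		/* lives in sh's hostdata */
	h->pdev = pdev;
	pci_set_drvdata(pdev, h);	/* recover h without hba[] */
	/* ...rest of controller init... */
	return 0;
}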


> +static irqreturn_t do_hpsa_intr(int irq, void *dev_id);
> +static int hpsa_ioctl(struct scsi_device *dev, int cmd, void *arg);
> +static void start_io(struct ctlr_info *h);
> +static int sendcmd(__u8 cmd, struct ctlr_info *h, void *buff, size_t size,
> + __u8 page_code, unsigned char *scsi3addr, int cmd_type);
> +
> +#ifdef CONFIG_COMPAT
> +static int hpsa_compat_ioctl(struct scsi_device *dev, int cmd, void *arg);
> +#endif
> +
> +static void cmd_free(struct ctlr_info *h, struct CommandList_struct *c);
> +static void cmd_special_free(struct ctlr_info *h, struct CommandList_struct *c);
> +static struct CommandList_struct *cmd_alloc(struct ctlr_info *h);
> +static struct CommandList_struct *cmd_special_alloc(struct ctlr_info *h);
> +
> +static int hpsa_scsi_proc_info(
> + struct Scsi_Host *sh,
> + char *buffer, /* data buffer */
> + char **start, /* where data in buffer starts */
> + off_t offset, /* offset from start of imaginary file */
> + int length, /* length of data in buffer */
> + int func); /* 0 == read, 1 == write */
> +
> +static int hpsa_scsi_queue_command(struct scsi_cmnd *cmd,
> + void (*done)(struct scsi_cmnd *));
> +
> +static int hpsa_eh_device_reset_handler(struct scsi_cmnd *scsicmd);
> +
> +static struct scsi_host_template hpsa_driver_template = {
> + .module = THIS_MODULE,
> + .name = "hpsa",
> + .proc_name = "hpsa",
> + .proc_info = hpsa_scsi_proc_info,
> + .queuecommand = hpsa_scsi_queue_command,
> + .can_queue = 512,
> + .this_id = -1,
> + .sg_tablesize = MAXSGENTRIES,

Is MAXSGENTRIES (32) a hardware limitation? If not, it might be better
to enlarge it for better performance.


> + .cmd_per_lun = 512,
> + .use_clustering = DISABLE_CLUSTERING,

Why can't we use ENABLE_CLUSTERING here? We would get better
performance with ENABLE_CLUSTERING.


> + .eh_device_reset_handler = hpsa_eh_device_reset_handler,
> + .ioctl = hpsa_ioctl,
> +#ifdef CONFIG_COMPAT
> + .compat_ioctl = hpsa_compat_ioctl,
> +#endif
> +};
> +
> +/* Enqueuing and dequeuing functions for cmdlists. */
> +static inline void addQ(struct hlist_head *list, struct CommandList_struct *c)
> +{
> + hlist_add_head(&c->list, list);
> +}
> +
> +static inline void removeQ(struct CommandList_struct *c)
> +{
> + if (WARN_ON(hlist_unhashed(&c->list)))
> + return;
> + hlist_del_init(&c->list);
> +}
> +
> +static inline int bit_is_set(__u8 bitarray[], int bit)
> +{
> + return bitarray[bit >> 3] & (1 << (bit & 0x07));
> +}
> +
> +static inline void set_bit_in_array(__u8 bitarray[], int bit)
> +{
> + bitarray[bit >> 3] |= (1 << (bit & 0x07));
> +}

Can't we use the standard bit operation functions instead?
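
For example (a sketch with hypothetical helper names; it assumes the
bitmap is declared as an array of unsigned long, which the
<linux/bitops.h> helpers require, rather than __u8):

static inline int hpsa_cmd_in_use(struct ctlr_info *h, int i)
{
	return test_bit(i, h->cmd_pool_bits);
}

static inline void hpsa_mark_cmd_in_use(struct ctlr_info *h, int i)
{
	set_bit(i, h->cmd_pool_bits);	/* atomic bitop */
}

static inline void hpsa_mark_cmd_free(struct ctlr_info *h, int i)
{
	clear_bit(i, h->cmd_pool_bits);
}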


> +/* hpsa_scatter_gather takes a struct scsi_cmnd, (cmd), and does the pci
> + dma mapping and fills in the scatter gather entries of the
> + hpsa command, cp. */
> +
> +static void hpsa_scatter_gather(struct pci_dev *pdev,
> + struct CommandList_struct *cp,
> + struct scsi_cmnd *cmd)
> +{
> + unsigned int len;
> + struct scatterlist *sg;
> + __u64 addr64;
> + int use_sg, i;
> +
> + BUG_ON(scsi_sg_count(cmd) > MAXSGENTRIES);
> +
> + use_sg = scsi_dma_map(cmd);
> + if (!use_sg)
> + goto sglist_finished;

We need to handle dma mapping failure here; scsi_dma_map could fail.
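
Something along these lines (a sketch; it assumes the function is changed
to return an int and that the caller turns failure into
SCSI_MLQUEUE_HOST_BUSY):

	use_sg = scsi_dma_map(cmd);
	if (use_sg < 0)
		return use_sg;		/* mapping failed, e.g. -ENOMEM */
	if (!use_sg)
		goto sglist_finished;	/* command carries no data */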


> + scsi_for_each_sg(cmd, sg, use_sg, i) {
> + addr64 = (__u64) sg_dma_address(sg);
> + len = sg_dma_len(sg);
> + cp->SG[i].Addr.lower =
> + (__u32) (addr64 & (__u64) 0x00000000FFFFFFFF);
> + cp->SG[i].Addr.upper =
> + (__u32) ((addr64 >> 32) & (__u64) 0x00000000FFFFFFFF);
> + cp->SG[i].Len = len;
> + cp->SG[i].Ext = 0; /* we are not chaining */
> + }
> +
> +sglist_finished:
> +
> + cp->Header.SGList = (__u8) use_sg; /* no. SGs contig in this cmd */
> + cp->Header.SGTotal = (__u16) use_sg; /* total sgs in this cmd list */
> + return;
> +}
> +
> +
> +static int hpsa_scsi_queue_command(struct scsi_cmnd *cmd,
> + void (*done)(struct scsi_cmnd *))
> +{
> + struct ctlr_info *h;
> + int rc;
> + unsigned char scsi3addr[8];
> + struct CommandList_struct *cp;
> + unsigned long flags;
> +
> + /* Get the ptr to our adapter structure (hba[i]) out of cmd->host. */
> + h = (struct ctlr_info *) cmd->device->host->hostdata[0];

Let's use shost_priv().
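
That is, something like (assuming struct ctlr_info is, or becomes, the
private data of the Scsi_Host):

	struct ctlr_info *h = shost_priv(cmd->device->host);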


> + rc = lookup_scsi3addr(h, cmd->device->channel, cmd->device->id,
> + cmd->device->lun, scsi3addr);
> + if (rc != 0) {
> + /* the scsi nexus does not match any that we presented... */
> + /* pretend to mid layer that we got selection timeout */
> + cmd->result = DID_NO_CONNECT << 16;
> + done(cmd);
> + /* we might want to think about registering controller itself
> + as a processor device on the bus so sg binds to it. */
> + return 0;
> + }
> +
> + /* Ok, we have a reasonable scsi nexus, so send the cmd down, and
> + see what the device thinks of it. */
> +
> + /* Need a lock as this is being allocated from the pool */
> + spin_lock_irqsave(&h->lock, flags);
> + cp = cmd_alloc(h);
> + spin_unlock_irqrestore(&h->lock, flags);
> + if (cp == NULL) { /* trouble... */

We have run out of commands here. Returning SCSI_MLQUEUE_HOST_BUSY would
be appropriate, I think.

But if we allocate shost->can_queue commands at startup, we can never run
out of commands.
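
Something like this sketch:

	spin_lock_irqsave(&h->lock, flags);
	cp = cmd_alloc(h);
	spin_unlock_irqrestore(&h->lock, flags);
	if (cp == NULL)
		/* out of commands: ask the midlayer to retry later */
		return SCSI_MLQUEUE_HOST_BUSY;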


> +/* Get us a file in /proc/hpsa that says something about each controller.
> + * Create /proc/hpsa if it doesn't exist yet. */
> +static void __devinit hpsa_procinit(struct ctlr_info *h)
> +{
> + struct proc_dir_entry *pde;
> +
> + if (proc_hpsa == NULL)
> + proc_hpsa = proc_mkdir("driver/hpsa", NULL);
> + if (!proc_hpsa)
> + return;
> + pde = proc_create_data(h->devname, S_IRUSR | S_IRGRP | S_IROTH,
> + proc_hpsa, &hpsa_proc_fops, h);
> +}
> +#endif /* CONFIG_PROC_FS */

Do we really need this? Creating something under /proc is not good;
using /sys/class/scsi_host/ is the proper way. If we remove the overlap
between hpsa and cciss, we can do it the proper way, I think.
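
For example, per-controller information can be exposed through host
attributes in the template (a sketch; the firmware_revision attribute and
the firm_ver field are made up for illustration):

static ssize_t host_show_firmware_revision(struct device *dev,
	struct device_attribute *attr, char *buf)
{
	struct Scsi_Host *shost = class_to_shost(dev);
	struct ctlr_info *h = shost_priv(shost);

	return snprintf(buf, PAGE_SIZE, "%.4s\n", h->firm_ver);
}
static DEVICE_ATTR(firmware_revision, S_IRUGO,
		host_show_firmware_revision, NULL);

static struct device_attribute *hpsa_shost_attrs[] = {
	&dev_attr_firmware_revision,
	NULL,
};

/* and in hpsa_driver_template: .shost_attrs = hpsa_shost_attrs, */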


> +/*
> + * For operations that cannot sleep, a command block is allocated at init,
> + * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track
> + * which ones are free or in use. Lock must be held when calling this.
> + * cmd_free() is the complement.
> + */
> +static struct CommandList_struct *cmd_alloc(struct ctlr_info *h)
> +{
> + struct CommandList_struct *c;
> + int i;
> + union u64bit temp64;
> + dma_addr_t cmd_dma_handle, err_dma_handle;
> +
> + do {
> + i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
> + if (i == h->nr_cmds)
> + return NULL;
> + } while (test_and_set_bit
> + (i & (BITS_PER_LONG - 1),
> + h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0);

Using a bitmap to manage free commands looks a bit too complicated to
me. Can't we just use lists for command management?
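
E.g. a free-list variant might look like this (a sketch; it assumes a
free_cmds hlist_head in struct ctlr_info, filled from the DMA-coherent
pool at init time, and it reuses the command's existing hlist_node, so a
real conversion would have to keep the free list and the request queue
from fighting over that node):

static struct CommandList_struct *cmd_alloc(struct ctlr_info *h)
{
	struct CommandList_struct *c;

	if (hlist_empty(&h->free_cmds))
		return NULL;
	c = hlist_entry(h->free_cmds.first,
			struct CommandList_struct, list);
	hlist_del_init(&c->list);
	return c;
}

static void cmd_free(struct ctlr_info *h, struct CommandList_struct *c)
{
	hlist_add_head(&c->list, &h->free_cmds);
}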


2009-03-02 17:19:35

by Grant Grundler

Subject: Re: [PATCH] hpsa: SCSI driver for HP Smart Array controllers

On Sun, Mar 1, 2009 at 10:32 PM, FUJITA Tomonori
<[email protected]> wrote:
...
>> +/*
>> + * For operations that cannot sleep, a command block is allocated at init,
>> + * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track
>> + * which ones are free or in use.  Lock must be held when calling this.
>> + * cmd_free() is the complement.
>> + */
>> +static struct CommandList_struct *cmd_alloc(struct ctlr_info *h)
>> +{
>> +     struct CommandList_struct *c;
>> +     int i;
>> +     union u64bit temp64;
>> +     dma_addr_t cmd_dma_handle, err_dma_handle;
>> +
>> +     do {
>> +             i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
>> +             if (i == h->nr_cmds)
>> +                     return NULL;
>> +     } while (test_and_set_bit
>> +              (i & (BITS_PER_LONG - 1),
>> +               h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0);
>
> Using a bitmap to manage free commands looks a bit too complicated to
> me. Can't we just use lists for command management?

Bitmaps are generally more efficient than lists since we touch less data,
both for searching and for moving elements between the free and busy
lists. This probably won't matter if we are talking less than 10K IOPS.
And willy demonstrated that other layers (block, libata and the SCSI
midlayer) have pretty high overhead at high transaction rates.

If nr_cmds can be greater than 8*BITS_PER_LONG or so, it would
be more efficient to save the allocation offset and start the next search
from that location. But I can't tell from the code since nr_cmds is
coming from the controller:

+ /* Query controller for max supported commands: */
+ c->max_commands = readl(&(c->cfgtable->CmdsOutMax));
...
+ c->nr_cmds = c->max_commands - 4;
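
Something like a rotor would do that (a sketch; last_allocation is a
hypothetical field in struct ctlr_info):

	do {
		i = find_next_zero_bit(h->cmd_pool_bits, h->nr_cmds,
				       h->last_allocation);
		if (i == h->nr_cmds) {		/* wrap around once */
			i = find_first_zero_bit(h->cmd_pool_bits,
						h->nr_cmds);
			if (i == h->nr_cmds)
				return NULL;	/* pool exhausted */
		}
	} while (test_and_set_bit(i, h->cmd_pool_bits) != 0);
	h->last_allocation = (i + 1 < h->nr_cmds) ? i + 1 : 0;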


hth,
grant

2009-03-02 18:37:14

by Jens Axboe

Subject: Re: [PATCH] hpsa: SCSI driver for HP Smart Array controllers

On Mon, Mar 02 2009, Mike Christie wrote:
> Grant Grundler wrote:
>> On Sun, Mar 1, 2009 at 10:32 PM, FUJITA Tomonori
>> <[email protected]> wrote:
>> ...
>>>> +/*
>>>> + * For operations that cannot sleep, a command block is allocated at init,
>>>> + * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track
>>>> + * which ones are free or in use. Lock must be held when calling this.
>>>> + * cmd_free() is the complement.
>>>> + */
>>>> +static struct CommandList_struct *cmd_alloc(struct ctlr_info *h)
>>>> +{
>>>> + struct CommandList_struct *c;
>>>> + int i;
>>>> + union u64bit temp64;
>>>> + dma_addr_t cmd_dma_handle, err_dma_handle;
>>>> +
>>>> + do {
>>>> + i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
>>>> + if (i == h->nr_cmds)
>>>> + return NULL;
>>>> + } while (test_and_set_bit
>>>> + (i & (BITS_PER_LONG - 1),
>>>> + h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0);
>>> Using a bitmap to manage free commands looks a bit too complicated to
>>> me. Can't we just use lists for command management?
>>
>> Bitmaps are generally more efficient than lists since we touch less data,
>> both for searching and for moving elements between the free and busy
>> lists. This probably won't matter if we are talking less than 10K IOPS.
>> And willy demonstrated that other layers (block, libata and the SCSI
>> midlayer) have pretty high overhead at high transaction rates.
>>
>>
>
> If we just needed this for the queuecommand path, it would be
> simple. For the queuecommand path we could just use the scsi host
> tagging code for the index. You do not need a lock in the queuecommand
> path for getting an index and command, and you do not need to duplicate
> the tag/index allocation code in the block/scsi code.
>
> A problem with the host tagging is what to do if you need a tag/index
> for an internal command. In a slow path like the device reset and cache
> flush case you could use a list or a preallocated command or whatever
> other drivers are using that makes you happy.
>
> Or for the reset/shutdown/internal path we could come up with an
> extension to the existing API. Maybe just add some wrapper around some
> of the blk_queue_start_tag code that takes the bqt (the bqt would come
> from the host-wide one) and allocates the tag (we'd need something
> similar for the release side).

This is precisely what I did for libata; here it is, interleaved with
some other stuff:

http://git.kernel.dk/?p=linux-2.6-block.git;a=commitdiff;h=f557570ec6042370333b6b9c33bbbae175120a89

It needs a little more polish and so on, but the concept is identical to
what you describe for this case. And I agree, it's much better to use
the same index instead of generating/maintaining separate bitmaps for
this type of thing.

--
Jens Axboe

2009-03-02 19:23:31

by Mike Christie

Subject: Re: [PATCH] hpsa: SCSI driver for HP Smart Array controllers

Grant Grundler wrote:
> On Sun, Mar 1, 2009 at 10:32 PM, FUJITA Tomonori
> <[email protected]> wrote:
> ...
>>> +/*
>>> + * For operations that cannot sleep, a command block is allocated at init,
>>> + * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track
>>> + * which ones are free or in use. Lock must be held when calling this.
>>> + * cmd_free() is the complement.
>>> + */
>>> +static struct CommandList_struct *cmd_alloc(struct ctlr_info *h)
>>> +{
>>> + struct CommandList_struct *c;
>>> + int i;
>>> + union u64bit temp64;
>>> + dma_addr_t cmd_dma_handle, err_dma_handle;
>>> +
>>> + do {
>>> + i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
>>> + if (i == h->nr_cmds)
>>> + return NULL;
>>> + } while (test_and_set_bit
>>> + (i & (BITS_PER_LONG - 1),
>>> + h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0);
>> Using a bitmap to manage free commands looks a bit too complicated to
>> me. Can't we just use lists for command management?
>
> Bitmaps are generally more efficient than lists since we touch less data,
> both for searching and for moving elements between the free and busy
> lists. This probably won't matter if we are talking less than 10K IOPS.
> And willy demonstrated that other layers (block, libata and the SCSI
> midlayer) have pretty high overhead at high transaction rates.
>

If we just needed this for the queuecommand path, it would be
simple. For the queuecommand path we could just use the scsi host
tagging code for the index. You do not need a lock in the queuecommand
path for getting an index and command, and you do not need to duplicate
the tag/index allocation code in the block/scsi code.

A problem with the host tagging is what to do if you need a tag/index
for an internal command. In a slow path like the device reset and cache
flush case you could use a list or a preallocated command or whatever
other drivers are using that makes you happy.

Or for the reset/shutdown/internal path we could come up with an
extension to the existing API. Maybe just add some wrapper around some
of the blk_queue_start_tag code that takes the bqt (the bqt would come
from the host-wide one) and allocates the tag (we'd need something
similar for the release side).
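
With an extension like that, the driver's fast path could be roughly (a
hypothetical sketch against the blk_map_start_tag()/blk_map_end_tag()
helpers proposed in the follow-up patch; it assumes the host-wide bqt
hanging off struct Scsi_Host and a cmdindex field in the command):

	/* queuecommand: the tag doubles as the command-pool index */
	tag = blk_map_start_tag(sh->bqt, cp, 0);
	if (tag < 0)
		return SCSI_MLQUEUE_HOST_BUSY;
	cp->cmdindex = tag;

	/* ...later, in the completion path... */
	blk_map_end_tag(sh->bqt, cp->cmdindex);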

2009-03-02 20:34:40

by Mike Christie

Subject: Re: [PATCH] hpsa: SCSI driver for HP Smart Array controllers

diff --git a/block/blk-tag.c b/block/blk-tag.c
index 3c518e3..0614faf 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -106,7 +106,7 @@ EXPORT_SYMBOL(blk_queue_free_tags);
static int
init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
{
- struct request **tag_index;
+ void **tag_index;
unsigned long *tag_map;
int nr_ulongs;

@@ -116,7 +116,7 @@ init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
__func__, depth);
}

- tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
+ tag_index = kzalloc(depth * sizeof(void *), GFP_ATOMIC);
if (!tag_index)
goto fail;

@@ -219,7 +219,7 @@ EXPORT_SYMBOL(blk_queue_init_tags);
int blk_queue_resize_tags(struct request_queue *q, int new_depth)
{
struct blk_queue_tag *bqt = q->queue_tags;
- struct request **tag_index;
+ void **tag_index;
unsigned long *tag_map;
int max_depth, nr_ulongs;

@@ -254,7 +254,7 @@ int blk_queue_resize_tags(struct request_queue *q, int new_depth)
if (init_tag_map(q, bqt, new_depth))
return -ENOMEM;

- memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
+ memcpy(bqt->tag_index, tag_index, max_depth * sizeof(void *));
nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));

@@ -265,24 +265,12 @@ int blk_queue_resize_tags(struct request_queue *q, int new_depth)
EXPORT_SYMBOL(blk_queue_resize_tags);

/**
- * blk_queue_end_tag - end tag operations for a request
- * @q: the request queue for the device
- * @rq: the request that has completed
- *
- * Description:
- * Typically called when end_that_request_first() returns %0, meaning
- * all transfers have been done for a request. It's important to call
- * this function before end_that_request_last(), as that will put the
- * request back on the free list thus corrupting the internal tag list.
- *
- * Notes:
- * queue lock must be held.
+ * blk_map_end_tag - end tag operation
+ * @bqt: block queue tag
+ * @tag: tag to clear
**/
-void blk_queue_end_tag(struct request_queue *q, struct request *rq)
+void blk_map_end_tag(struct blk_queue_tag *bqt, int tag)
{
- struct blk_queue_tag *bqt = q->queue_tags;
- int tag = rq->tag;
-
BUG_ON(tag == -1);

if (unlikely(tag >= bqt->real_max_depth))
@@ -292,10 +280,6 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
*/
return;

- list_del_init(&rq->queuelist);
- rq->cmd_flags &= ~REQ_QUEUED;
- rq->tag = -1;
-
if (unlikely(bqt->tag_index[tag] == NULL))
printk(KERN_ERR "%s: tag %d is missing\n",
__func__, tag);
@@ -313,9 +297,65 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
*/
clear_bit_unlock(tag, bqt->tag_map);
}
+EXPORT_SYMBOL(blk_map_end_tag);
+
+/**
+ * blk_queue_end_tag - end tag operations for a request
+ * @q: the request queue for the device
+ * @rq: the request that has completed
+ *
+ * Description:
+ * Typically called when end_that_request_first() returns %0, meaning
+ * all transfers have been done for a request. It's important to call
+ * this function before end_that_request_last(), as that will put the
+ * request back on the free list thus corrupting the internal tag list.
+ *
+ * Notes:
+ * queue lock must be held.
+ **/
+void blk_queue_end_tag(struct request_queue *q, struct request *rq)
+{
+ blk_map_end_tag(q->queue_tags, rq->tag);
+
+ list_del_init(&rq->queuelist);
+ rq->cmd_flags &= ~REQ_QUEUED;
+ rq->tag = -1;
+}
EXPORT_SYMBOL(blk_queue_end_tag);

/**
+ * blk_map_start_tag - find a free tag
+ * @bqt: block queue tag
+ * @object: object to store in bqt tag_index at index returned by tag
+ * @offset: offset into bqt tag map
+ **/
+int blk_map_start_tag(struct blk_queue_tag *bqt, void *object, unsigned offset)
+{
+ unsigned max_depth;
+ int tag;
+
+ /*
+ * Protect against shared tag maps, as we may not have exclusive
+ * access to the tag map.
+ */
+ max_depth = bqt->max_depth;
+ do {
+ tag = find_next_zero_bit(bqt->tag_map, max_depth, offset);
+ if (tag >= max_depth)
+ return -1;
+
+ } while (test_and_set_bit_lock(tag, bqt->tag_map));
+ /*
+ * We need lock ordering semantics given by test_and_set_bit_lock.
+ * See blk_map_end_tag for details.
+ */
+
+ bqt->tag_index[tag] = object;
+ return tag;
+}
+EXPORT_SYMBOL(blk_map_start_tag);
+
+/**
* blk_queue_start_tag - find a free tag and assign it
* @q: the request queue for the device
* @rq: the block request that needs tagging
@@ -347,10 +387,8 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
BUG();
}

+
/*
- * Protect against shared tag maps, as we may not have exclusive
- * access to the tag map.
- *
* We reserve a few tags just for sync IO, since we don't want
* to starve sync IO on behalf of flooding async IO.
*/
@@ -360,20 +398,12 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
else
offset = max_depth >> 2;

- do {
- tag = find_next_zero_bit(bqt->tag_map, max_depth, offset);
- if (tag >= max_depth)
- return 1;
-
- } while (test_and_set_bit_lock(tag, bqt->tag_map));
- /*
- * We need lock ordering semantics given by test_and_set_bit_lock.
- * See blk_queue_end_tag for details.
- */
+ tag = blk_map_start_tag(bqt, rq, offset);
+ if (tag < 0)
+ return 1;

rq->cmd_flags |= REQ_QUEUED;
rq->tag = tag;
- bqt->tag_index[tag] = rq;
blkdev_dequeue_request(rq);
list_add(&rq->queuelist, &q->tag_busy_list);
return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 465d6ba..d748261 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -290,7 +290,7 @@ enum blk_queue_state {
};

struct blk_queue_tag {
- struct request **tag_index; /* map of busy tags */
+ void **tag_index; /* map of busy tags */
unsigned long *tag_map; /* bit map of free/busy tags */
int busy; /* current depth */
int max_depth; /* what we will send to device */
@@ -904,6 +904,8 @@ extern int blk_queue_resize_tags(struct request_queue *, int);
extern void blk_queue_invalidate_tags(struct request_queue *);
extern struct blk_queue_tag *blk_init_tags(int);
extern void blk_free_tags(struct blk_queue_tag *);
+extern int blk_map_start_tag(struct blk_queue_tag *, void *, unsigned);
+extern void blk_map_end_tag(struct blk_queue_tag *, int);

static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
int tag)


Attachments:
make-tagging-more-generic.patch (6.33 kB)

2009-03-02 20:38:26

by Mike Christie

Subject: Re: [PATCH] hpsa: SCSI driver for HP Smart Array controllers

Mike Christie wrote:
> iscsi also needs the unique tag and then it needs the
> blk_map_queue_find_tag functionality too. iscsi needs the lookup and tag
> for host/transport level commands that do not have a scsi
> command/request. The tag value has to be unique across the
> host/transport

I mean that the tag needs to be unique for scsi/block commands and
transport commands for each host. So a scsi command and a transport
command cannot both have tag X.

2009-03-03 09:43:33

by Jens Axboe

Subject: Re: [PATCH] hpsa: SCSI driver for HP Smart Array controllers

On Mon, Mar 02 2009, Mike Christie wrote:
> Jens Axboe wrote:
>> On Mon, Mar 02 2009, Mike Christie wrote:
>>> Grant Grundler wrote:
>>>> On Sun, Mar 1, 2009 at 10:32 PM, FUJITA Tomonori
>>>> <[email protected]> wrote:
>>>> ...
>>>>>> +/*
>>>>>> + * For operations that cannot sleep, a command block is allocated at init,
>>>>>> + * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track
>>>>>> + * which ones are free or in use. Lock must be held when calling this.
>>>>>> + * cmd_free() is the complement.
>>>>>> + */
>>>>>> +static struct CommandList_struct *cmd_alloc(struct ctlr_info *h)
>>>>>> +{
>>>>>> + struct CommandList_struct *c;
>>>>>> + int i;
>>>>>> + union u64bit temp64;
>>>>>> + dma_addr_t cmd_dma_handle, err_dma_handle;
>>>>>> +
>>>>>> + do {
>>>>>> + i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
>>>>>> + if (i == h->nr_cmds)
>>>>>> + return NULL;
>>>>>> + } while (test_and_set_bit
>>>>>> + (i & (BITS_PER_LONG - 1),
>>>>>> + h->cmd_pool_bits + (i / BITS_PER_LONG)) != 0);
>>>>> Using a bitmap to manage free commands looks a bit too complicated to
>>>>> me. Can't we just use lists for command management?
>>>> Bitmaps are generally more efficient than lists since we touch less data,
>>>> both for searching and for moving elements between the free and busy
>>>> lists. This probably won't matter if we are talking less than 10K IOPS.
>>>> And willy demonstrated that other layers (block, libata and the SCSI
>>>> midlayer) have pretty high overhead at high transaction rates.
>>>>
>>> If we just needed this for the queuecommand path, it would be
>>> simple. For the queuecommand path we could just use the scsi host
>>> tagging code for the index. You do not need a lock in the
>>> queuecommand path for getting an index and command, and you do not
>>> need to duplicate the tag/index allocation code in the block/scsi
>>> code.
>>>
>>> A problem with the host tagging is what to do if you need a tag/index
>>> for an internal command. In a slow path like the device reset and
>>> cache flush case you could use a list or a preallocated command or
>>> whatever other drivers are using that makes you happy.
>>>
>>> Or for the reset/shutdown/internal path we could come up with an
>>> extension to the existing API. Maybe just add some wrapper around
>>> some of the blk_queue_start_tag code that takes the bqt (the bqt
>>> would come from the host-wide one) and allocates the tag (we'd need
>>> something similar for the release side).
>>
>> This is precisely what I did for libata; here it is, interleaved with
>> some other stuff:
>>
>> http://git.kernel.dk/?p=linux-2.6-block.git;a=commitdiff;h=f557570ec6042370333b6b9c33bbbae175120a89
>>
>> It needs a little more polish and so on, but the concept is identical to
>> what you describe for this case. And I agree, it's much better to use
>> the same index instead of generating/maintaining separate bitmaps for
>> this type of thing.
>>
>
> In that patch where does the tag come from? Is it from libata?

This specific one is for libata which reserves an internal tag, hence it
just needs to wait for that. Splitting the tag map find/set/clear
functions as your patch does is perfectly doable, no problem with that.

>
> What if we wanted and/or needed the bqt to give us a tag value and we
> need it for the lookup? It looks like for hpsa we could kill its
find_first_zero_bit code and use the code in blk_queue_start_tag.
>
> iscsi also needs the unique tag and then it needs the
> blk_map_queue_find_tag functionality too. iscsi needs the lookup and tag
> for host/transport level commands that do not have a scsi
> command/request. The tag value has to be unique across the
> host/transport (actually just the transport, but ignore that for now to
> make it simple and because for software iscsi we do a host per transport
> connection). Do you think something like the attached patch would be ok
> (it is only compile tested)?

> diff --git a/block/blk-tag.c b/block/blk-tag.c
> index 3c518e3..0614faf 100644
> --- a/block/blk-tag.c
> +++ b/block/blk-tag.c
> @@ -106,7 +106,7 @@ EXPORT_SYMBOL(blk_queue_free_tags);
> static int
> init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
> {
> - struct request **tag_index;
> + void **tag_index;
> unsigned long *tag_map;
> int nr_ulongs;
>
> @@ -116,7 +116,7 @@ init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
> __func__, depth);
> }
>
> - tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
> + tag_index = kzalloc(depth * sizeof(void *), GFP_ATOMIC);
> if (!tag_index)
> goto fail;
>
> @@ -219,7 +219,7 @@ EXPORT_SYMBOL(blk_queue_init_tags);
> int blk_queue_resize_tags(struct request_queue *q, int new_depth)
> {
> struct blk_queue_tag *bqt = q->queue_tags;
> - struct request **tag_index;
> + void **tag_index;
> unsigned long *tag_map;
> int max_depth, nr_ulongs;
>
> @@ -254,7 +254,7 @@ int blk_queue_resize_tags(struct request_queue *q, int new_depth)
> if (init_tag_map(q, bqt, new_depth))
> return -ENOMEM;
>
> - memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
> + memcpy(bqt->tag_index, tag_index, max_depth * sizeof(void *));
> nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
> memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
>
> @@ -265,24 +265,12 @@ int blk_queue_resize_tags(struct request_queue *q, int new_depth)
> EXPORT_SYMBOL(blk_queue_resize_tags);
>
> /**
> - * blk_queue_end_tag - end tag operations for a request
> - * @q: the request queue for the device
> - * @rq: the request that has completed
> - *
> - * Description:
> - * Typically called when end_that_request_first() returns %0, meaning
> - * all transfers have been done for a request. It's important to call
> - * this function before end_that_request_last(), as that will put the
> - * request back on the free list thus corrupting the internal tag list.
> - *
> - * Notes:
> - * queue lock must be held.
> + * blk_map_end_tag - end tag operation
> + * @bqt: block queue tag
> + * @tag: tag to clear
> **/
> -void blk_queue_end_tag(struct request_queue *q, struct request *rq)
> +void blk_map_end_tag(struct blk_queue_tag *bqt, int tag)
> {
> - struct blk_queue_tag *bqt = q->queue_tags;
> - int tag = rq->tag;
> -
> BUG_ON(tag == -1);
>
> if (unlikely(tag >= bqt->real_max_depth))
> @@ -292,10 +280,6 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
> */
> return;
>
> - list_del_init(&rq->queuelist);
> - rq->cmd_flags &= ~REQ_QUEUED;
> - rq->tag = -1;
> -
> if (unlikely(bqt->tag_index[tag] == NULL))
> printk(KERN_ERR "%s: tag %d is missing\n",
> __func__, tag);
> @@ -313,9 +297,65 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
> */
> clear_bit_unlock(tag, bqt->tag_map);
> }
> +EXPORT_SYMBOL(blk_map_end_tag);
> +
> +/**
> + * blk_queue_end_tag - end tag operations for a request
> + * @q: the request queue for the device
> + * @rq: the request that has completed
> + *
> + * Description:
> + * Typically called when end_that_request_first() returns %0, meaning
> + * all transfers have been done for a request. It's important to call
> + * this function before end_that_request_last(), as that will put the
> + * request back on the free list thus corrupting the internal tag list.
> + *
> + * Notes:
> + * queue lock must be held.
> + **/
> +void blk_queue_end_tag(struct request_queue *q, struct request *rq)
> +{
> + blk_map_end_tag(q->queue_tags, rq->tag);
> +
> + list_del_init(&rq->queuelist);
> + rq->cmd_flags &= ~REQ_QUEUED;
> + rq->tag = -1;
> +}
> EXPORT_SYMBOL(blk_queue_end_tag);
>
> /**
> + * blk_map_start_tag - find a free tag
> + * @bqt: block queue tag
> + * @object: object to store in bqt tag_index at index returned by tag
> + * @offset: offset into bqt tag map
> + **/
> +int blk_map_start_tag(struct blk_queue_tag *bqt, void *object, unsigned offset)
> +{
> + unsigned max_depth;
> + int tag;
> +
> + /*
> + * Protect against shared tag maps, as we may not have exclusive
> + * access to the tag map.
> + */
> + max_depth = bqt->max_depth;
> + do {
> + tag = find_next_zero_bit(bqt->tag_map, max_depth, offset);
> + if (tag >= max_depth)
> + return -1;
> +
> + } while (test_and_set_bit_lock(tag, bqt->tag_map));
> + /*
> + * We need lock ordering semantics given by test_and_set_bit_lock.
> + * See blk_map_end_tag for details.
> + */
> +
> + bqt->tag_index[tag] = object;
> + return tag;
> +}
> +EXPORT_SYMBOL(blk_map_start_tag);
> +
> +/**
> * blk_queue_start_tag - find a free tag and assign it
> * @q: the request queue for the device
> * @rq: the block request that needs tagging
> @@ -347,10 +387,8 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
> BUG();
> }
>
> +
> /*
> - * Protect against shared tag maps, as we may not have exclusive
> - * access to the tag map.
> - *
> * We reserve a few tags just for sync IO, since we don't want
> * to starve sync IO on behalf of flooding async IO.
> */
> @@ -360,20 +398,12 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
> else
> offset = max_depth >> 2;
>
> - do {
> - tag = find_next_zero_bit(bqt->tag_map, max_depth, offset);
> - if (tag >= max_depth)
> - return 1;
> -
> - } while (test_and_set_bit_lock(tag, bqt->tag_map));
> - /*
> - * We need lock ordering semantics given by test_and_set_bit_lock.
> - * See blk_queue_end_tag for details.
> - */
> + tag = blk_map_start_tag(bqt, rq, offset);
> + if (tag < 0)
> + return 1;
>
> rq->cmd_flags |= REQ_QUEUED;
> rq->tag = tag;
> - bqt->tag_index[tag] = rq;
> blkdev_dequeue_request(rq);
> list_add(&rq->queuelist, &q->tag_busy_list);
> return 0;
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 465d6ba..d748261 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -290,7 +290,7 @@ enum blk_queue_state {
> };
>
> struct blk_queue_tag {
> - struct request **tag_index; /* map of busy tags */
> + void **tag_index; /* map of busy tags */
> unsigned long *tag_map; /* bit map of free/busy tags */
> int busy; /* current depth */
> int max_depth; /* what we will send to device */
> @@ -904,6 +904,8 @@ extern int blk_queue_resize_tags(struct request_queue *, int);
> extern void blk_queue_invalidate_tags(struct request_queue *);
> extern struct blk_queue_tag *blk_init_tags(int);
> extern void blk_free_tags(struct blk_queue_tag *);
> +extern int blk_map_start_tag(struct blk_queue_tag *, void *, unsigned);
> +extern void blk_map_end_tag(struct blk_queue_tag *, int);
>
> static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
> int tag)


--
Jens Axboe