2005-05-26 14:04:43

by Jens Axboe

[permalink] [raw]
Subject: Playing with SATA NCQ

Hi,

I was curious how well NCQ works on recent SATA disks (and controllers),
so I hacked in some basic support for NCQ in libata and ahci. First of
all, ahci is a really nice architecture, it was a joy to work with. NCQ
likewise, a pleasure to finally see some sane queueing facility for
SATA/ide!

Results are pretty damn nice, I easily get 30-50% faster random io read
performance without having to try hard.

Now, this patch is not complete. It should work and work well, but error
handling isn't really tested or finished yet (by any stretch of the
imagination). So don't use this on a production machine, it _should_ be
safe to use on your test boxes and for-fun workstations though (I run it
here...). I have tested on ich6 and ich7 generation ahci, and with
Maxtor and Seagate drives.

YMMV. If you try it, do let me know.

Index: drivers/scsi/ahci.c
===================================================================
--- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/ahci.c (mode:100644)
+++ uncommitted/drivers/scsi/ahci.c (mode:100644)
@@ -19,8 +19,8 @@
* If you do not delete the provisions above, a recipient may use your
* version of this file under either the OSL or the GPL.
*
- * Version 1.0 of the AHCI specification:
- * http://www.intel.com/technology/serialata/pdf/rev1_0.pdf
+ * Version 1.1 of the AHCI specification:
+ * http://www.intel.com/technology/serialata/pdf/rev1_1.pdf
*
*/

@@ -46,10 +46,13 @@
AHCI_MAX_SG = 168, /* hardware max is 64K */
AHCI_DMA_BOUNDARY = 0xffffffff,
AHCI_USE_CLUSTERING = 0,
- AHCI_CMD_SLOT_SZ = 32 * 32,
- AHCI_RX_FIS_SZ = 256,
+ AHCI_MAX_CMDS = 32,
+ AHCI_CMD_SZ = 32,
AHCI_CMD_TBL_HDR = 0x80,
- AHCI_CMD_TBL_SZ = AHCI_CMD_TBL_HDR + (AHCI_MAX_SG * 16),
+ AHCI_CMD_SLOT_SZ = AHCI_MAX_CMDS * AHCI_CMD_SZ,
+ AHCI_RX_FIS_SZ = 256,
+ AHCI_CMD_TOTAL = AHCI_CMD_TBL_HDR + (AHCI_MAX_SG * 16),
+ AHCI_CMD_TBL_SZ = AHCI_MAX_CMDS * AHCI_CMD_TOTAL,
AHCI_PORT_PRIV_DMA_SZ = AHCI_CMD_SLOT_SZ + AHCI_CMD_TBL_SZ +
AHCI_RX_FIS_SZ,
AHCI_IRQ_ON_SG = (1 << 31),
@@ -57,6 +60,7 @@
AHCI_CMD_WRITE = (1 << 6),

RX_FIS_D2H_REG = 0x40, /* offset of D2H Register FIS data */
+ RX_FIS_SDB_REG = 0x58, /* offset of SDB Register FIS data */

board_ahci = 0,

@@ -74,6 +78,7 @@

/* HOST_CAP bits */
HOST_CAP_64 = (1 << 31), /* PCI DAC (64-bit DMA) support */
+ HOST_CAP_NCQ = (1 << 30), /* Native Command Queueing */

/* registers for each SATA port */
PORT_LST_ADDR = 0x00, /* command list DMA addr */
@@ -161,9 +166,9 @@
dma_addr_t cmd_slot_dma;
void *cmd_tbl;
dma_addr_t cmd_tbl_dma;
- struct ahci_sg *cmd_tbl_sg;
void *rx_fis;
dma_addr_t rx_fis_dma;
+ u32 sactive;
};

static u32 ahci_scr_read (struct ata_port *ap, unsigned int sc_reg);
@@ -181,7 +186,7 @@
static void ahci_qc_prep(struct ata_queued_cmd *qc);
static u8 ahci_check_status(struct ata_port *ap);
static u8 ahci_check_err(struct ata_port *ap);
-static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
+static inline int ahci_host_intr(struct ata_port *ap);

static Scsi_Host_Template ahci_sht = {
.module = THIS_MODULE,
@@ -189,7 +194,7 @@
.ioctl = ata_scsi_ioctl,
.queuecommand = ata_scsi_queuecmd,
.eh_strategy_handler = ata_scsi_error,
- .can_queue = ATA_DEF_QUEUE,
+ .can_queue = ATA_MAX_QUEUE,
.this_id = ATA_SHT_THIS_ID,
.sg_tablesize = AHCI_MAX_SG,
.max_sectors = ATA_MAX_SECTORS,
@@ -200,7 +205,7 @@
.dma_boundary = AHCI_DMA_BOUNDARY,
.slave_configure = ata_scsi_slave_config,
.bios_param = ata_std_bios_param,
- .ordered_flush = 1,
+ .ordered_flush = 0, /* conflicts with NCQ for now */
};

static struct ata_port_operations ahci_ops = {
@@ -339,14 +344,11 @@
mem_dma += AHCI_RX_FIS_SZ;

/*
- * Third item: data area for storing a single command
- * and its scatter-gather table
+ * Third item: data area for storing commands and their sg tables
*/
pp->cmd_tbl = mem;
pp->cmd_tbl_dma = mem_dma;

- pp->cmd_tbl_sg = mem + AHCI_CMD_TBL_HDR;
-
ap->private_data = pp;

if (hpriv->cap & HOST_CAP_64)
@@ -478,9 +480,10 @@
ata_tf_from_fis(d2h_fis, tf);
}

-static void ahci_fill_sg(struct ata_queued_cmd *qc)
+static void ahci_fill_sg(struct ata_queued_cmd *qc, int offset)
{
struct ahci_port_priv *pp = qc->ap->private_data;
+ struct ahci_sg *cmd_tbl_sg;
unsigned int i;

VPRINTK("ENTER\n");
@@ -488,6 +491,7 @@
/*
* Next, the S/G list.
*/
+ cmd_tbl_sg = pp->cmd_tbl + offset + AHCI_CMD_TBL_HDR;
for (i = 0; i < qc->n_elem; i++) {
u32 sg_len;
dma_addr_t addr;
@@ -495,21 +499,30 @@
addr = sg_dma_address(&qc->sg[i]);
sg_len = sg_dma_len(&qc->sg[i]);

- pp->cmd_tbl_sg[i].addr = cpu_to_le32(addr & 0xffffffff);
- pp->cmd_tbl_sg[i].addr_hi = cpu_to_le32((addr >> 16) >> 16);
- pp->cmd_tbl_sg[i].flags_size = cpu_to_le32(sg_len - 1);
+ cmd_tbl_sg[i].addr = cpu_to_le32(addr & 0xffffffff);
+ cmd_tbl_sg[i].addr_hi = cpu_to_le32((addr >> 16) >> 16);
+ cmd_tbl_sg[i].flags_size = cpu_to_le32(sg_len - 1);
}
}

static void ahci_qc_prep(struct ata_queued_cmd *qc)
{
struct ahci_port_priv *pp = qc->ap->private_data;
- u32 opts;
+ void *port_mmio = (void *) qc->ap->ioaddr.cmd_addr;
const u32 cmd_fis_len = 5; /* five dwords */
+ dma_addr_t cmd_tbl_dma;
+ u32 opts;
+ int offset;
+
+ if (qc->flags & ATA_QCFLAG_NCQ) {
+ pp->sactive |= (1 << qc->tag);
+
+ writel(1 << qc->tag, port_mmio + PORT_SCR_ACT);
+ readl(port_mmio + PORT_SCR_ACT); /* flush */
+ }

/*
- * Fill in command slot information (currently only one slot,
- * slot 0, is currently since we don't do queueing)
+ * Fill in command slot information
*/

opts = (qc->n_elem << 16) | cmd_fis_len;
@@ -528,21 +541,28 @@
break;
}

- pp->cmd_slot[0].opts = cpu_to_le32(opts);
- pp->cmd_slot[0].status = 0;
- pp->cmd_slot[0].tbl_addr = cpu_to_le32(pp->cmd_tbl_dma & 0xffffffff);
- pp->cmd_slot[0].tbl_addr_hi = cpu_to_le32((pp->cmd_tbl_dma >> 16) >> 16);
+ /*
+ * the tag determines the offset into the allocated cmd table
+ */
+ offset = qc->tag * AHCI_CMD_TOTAL;
+
+ cmd_tbl_dma = pp->cmd_tbl_dma + offset;
+
+ pp->cmd_slot[qc->tag].opts = cpu_to_le32(opts);
+ pp->cmd_slot[qc->tag].status = 0;
+ pp->cmd_slot[qc->tag].tbl_addr = cpu_to_le32(cmd_tbl_dma & 0xffffffff);
+ pp->cmd_slot[qc->tag].tbl_addr_hi = cpu_to_le32((cmd_tbl_dma >> 16) >> 16);

/*
* Fill in command table information. First, the header,
* a SATA Register - Host to Device command FIS.
*/
- ata_tf_to_fis(&qc->tf, pp->cmd_tbl, 0);
+ ata_tf_to_fis(&qc->tf, pp->cmd_tbl + offset, 0);

if (!(qc->flags & ATA_QCFLAG_DMAMAP))
return;

- ahci_fill_sg(qc);
+ ahci_fill_sg(qc, offset);
}

static void ahci_intr_error(struct ata_port *ap, u32 irq_stat)
@@ -593,7 +613,70 @@
printk(KERN_WARNING "ata%u: error occurred, port reset\n", ap->id);
}

-static void ahci_eng_timeout(struct ata_port *ap)
+static void dump_log_page(unsigned char *p)
+{
+ int i;
+
+ printk("LOG 0x10: nq=%d, tag=%d\n", p[0] >> 7, p[0] & 0x1f);
+
+ for (i = 2; i < 14; i++)
+ printk("%d:%d ", i, p[i]);
+
+ printk("\n");
+}
+
+static void ahci_host_ncq_intr_err(struct ata_port *ap)
+{
+ struct ahci_port_priv *pp = ap->private_data;
+ void *mmio = ap->host_set->mmio_base;
+ void *port_mmio = ahci_port_base(mmio, ap->port_no);
+ unsigned long flags;
+ char *buffer;
+
+ printk(KERN_ERR "ata%u: ncq interrupt error\n", ap->id);
+
+ /*
+ * error all io first
+ */
+ spin_lock_irqsave(&ap->host_set->lock, flags);
+
+ while (pp->sactive) {
+ struct ata_queued_cmd *qc;
+ int tag = ffs(pp->sactive) - 1;
+
+ pp->sactive &= ~(1 << tag);
+ qc = ata_qc_from_tag(ap, tag);
+ if (qc)
+ ata_qc_complete(qc, ATA_ERR);
+ }
+
+ spin_unlock_irqrestore(&ap->host_set->lock, flags);
+
+ ahci_intr_error(ap, readl(port_mmio + PORT_IRQ_STAT));
+
+ buffer = kmalloc(512, GFP_KERNEL);
+ if (!buffer) {
+ printk(KERN_ERR "ata%u: unable to allocate memory for error\n", ap->id);
+ return;
+ }
+
+ if (ata_read_log_page(ap, 0, 0x10, buffer, 1))
+ printk(KERN_ERR "ata%u: unable to read log page\n", ap->id);
+ else
+ dump_log_page(buffer);
+
+ kfree(buffer);
+}
+
+/*
+ * TODO: needs to use READ_LOG_EXT/page=10h to retrieve error information
+ */
+static void ahci_ncq_timeout(struct ata_port *ap)
+{
+ ahci_host_ncq_intr_err(ap);
+}
+
+static void ahci_nonncq_timeout(struct ata_port *ap)
{
void *mmio = ap->host_set->mmio_base;
void *port_mmio = ahci_port_base(mmio, ap->port_no);
@@ -617,13 +700,65 @@
qc->scsidone = scsi_finish_command;
ata_qc_complete(qc, ATA_ERR);
}
+}

+static void ahci_eng_timeout(struct ata_port *ap)
+{
+ if (ap->ncq_depth)
+ ahci_ncq_timeout(ap);
+ else
+ ahci_nonncq_timeout(ap);
}

-static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
+static inline void ahci_host_ncq_intr(struct ata_port *ap, u32 status)
{
+ struct ahci_port_priv *pp = ap->private_data;
void *mmio = ap->host_set->mmio_base;
void *port_mmio = ahci_port_base(mmio, ap->port_no);
+ u32 tags, sactive;
+
+ if (status & PORT_IRQ_D2H_REG_FIS) {
+ u8 *d2h_fis = pp->rx_fis + RX_FIS_D2H_REG;
+
+ /*
+ * pre-BSY clear error, let timeout error handling take care
+ * of it when it kicks in
+ */
+ if (d2h_fis[2] & ATA_ERR)
+ return;
+ } else if (status & PORT_IRQ_SDB_FIS) {
+ u8 *sdb_fis = pp->rx_fis + RX_FIS_SDB_REG;
+
+ if (sdb_fis[1] & ATA_ERR)
+ return;
+ }
+
+ /*
+ * SActive will have the bits cleared for completed commands
+ */
+ sactive = readl(port_mmio + PORT_SCR_ACT);
+ tags = pp->sactive & ~sactive;
+
+ while (tags) {
+ struct ata_queued_cmd *qc;
+ int tag = ffs(tags) - 1;
+
+ tags &= ~(1 << tag);
+ qc = ata_qc_from_tag(ap, tag);
+ if (qc)
+ ata_qc_complete(qc, 0);
+ else
+ printk(KERN_ERR "ahci: missing tag %d\n", tag);
+ }
+
+ pp->sactive = sactive;
+}
+
+static inline int ahci_host_intr(struct ata_port *ap)
+{
+ void *mmio = ap->host_set->mmio_base;
+ void *port_mmio = ahci_port_base(mmio, ap->port_no);
+ struct ata_queued_cmd *qc;
u32 status, serr, ci;

serr = readl(port_mmio + PORT_SCR_ERR);
@@ -632,18 +767,27 @@
status = readl(port_mmio + PORT_IRQ_STAT);
writel(status, port_mmio + PORT_IRQ_STAT);

- ci = readl(port_mmio + PORT_CMD_ISSUE);
- if (likely((ci & 0x1) == 0)) {
- if (qc) {
- ata_qc_complete(qc, 0);
- qc = NULL;
+ if (ap->ncq_depth) {
+ if (!serr)
+ ahci_host_ncq_intr(ap, status);
+ else
+ ahci_host_ncq_intr_err(ap);
+ } else {
+ ci = readl(port_mmio + PORT_CMD_ISSUE);
+ if (likely((ci & 0x1) == 0)) {
+ qc = ata_qc_from_tag(ap, ap->active_tag);
+ if (qc) {
+ ata_qc_complete(qc, 0);
+ qc = NULL;
+ }
}
- }

- if (status & PORT_IRQ_FATAL) {
- ahci_intr_error(ap, status);
- if (qc)
- ata_qc_complete(qc, ATA_ERR);
+ if (status & PORT_IRQ_FATAL) {
+ ahci_intr_error(ap, status);
+ qc = ata_qc_from_tag(ap, ap->active_tag);
+ if (qc)
+ ata_qc_complete(qc, ATA_ERR);
+ }
}

return 1;
@@ -683,9 +827,7 @@
ap = host_set->ports[i];
tmp = irq_stat & (1 << i);
if (tmp && ap) {
- struct ata_queued_cmd *qc;
- qc = ata_qc_from_tag(ap, ap->active_tag);
- if (ahci_host_intr(ap, qc))
+ if (ahci_host_intr(ap))
irq_ack |= (1 << i);
}
}
@@ -707,10 +849,7 @@
struct ata_port *ap = qc->ap;
void *port_mmio = (void *) ap->ioaddr.cmd_addr;

- writel(1, port_mmio + PORT_SCR_ACT);
- readl(port_mmio + PORT_SCR_ACT); /* flush */
-
- writel(1, port_mmio + PORT_CMD_ISSUE);
+ writel(1 << qc->tag, port_mmio + PORT_CMD_ISSUE);
readl(port_mmio + PORT_CMD_ISSUE); /* flush */

return 0;
@@ -1027,6 +1166,9 @@
if (rc)
goto err_out_hpriv;

+ if (hpriv->cap & HOST_CAP_NCQ)
+ probe_ent->host_flags |= ATA_FLAG_NCQ;
+
ahci_print_info(probe_ent);

/* FIXME: check ata_device_add return value */
Index: drivers/scsi/libata-core.c
===================================================================
--- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/libata-core.c (mode:100644)
+++ uncommitted/drivers/scsi/libata-core.c (mode:100644)
@@ -519,7 +519,7 @@
* LOCKING:
* None.
*/
-static int ata_prot_to_cmd(int protocol, int lba48)
+static int ata_prot_to_cmd(int protocol, int lba48, int ncq)
{
int rcmd = 0, wcmd = 0;

@@ -535,7 +535,10 @@
break;

case ATA_PROT_DMA:
- if (lba48) {
+ if (ncq) {
+ rcmd = ATA_CMD_FPDMA_READ;
+ wcmd = ATA_CMD_FPDMA_WRITE;
+ } else if (lba48) {
rcmd = ATA_CMD_READ_EXT;
wcmd = ATA_CMD_WRITE_EXT;
} else {
@@ -568,6 +571,7 @@
{
int pio = (dev->flags & ATA_DFLAG_PIO);
int lba48 = (dev->flags & ATA_DFLAG_LBA48);
+ int ncq = (dev->flags & ATA_DFLAG_NCQ);
int proto, cmd;

if (pio)
@@ -575,7 +579,7 @@
else
proto = dev->xfer_protocol = ATA_PROT_DMA;

- cmd = ata_prot_to_cmd(proto, lba48);
+ cmd = ata_prot_to_cmd(proto, lba48, ncq);
if (cmd < 0)
BUG();

@@ -1139,6 +1143,10 @@
goto err_out_nosup;
}

+ if (ap->flags & ATA_FLAG_NCQ)
+ if (ata_id_queue_depth(dev->id) > 1)
+ dev->flags |= ATA_DFLAG_NCQ;
+
if (ata_id_has_lba48(dev->id)) {
dev->flags |= ATA_DFLAG_LBA48;
dev->n_sectors = ata_id_u64(dev->id, 100);
@@ -1149,11 +1157,12 @@
ap->host->max_cmd_len = 16;

/* print device info to dmesg */
- printk(KERN_INFO "ata%u: dev %u ATA, max %s, %Lu sectors:%s\n",
+ printk(KERN_INFO "ata%u: dev %u ATA, max %s, %Lu sectors:%s %s\n",
ap->id, device,
ata_mode_string(xfer_modes),
(unsigned long long)dev->n_sectors,
- dev->flags & ATA_DFLAG_LBA48 ? " lba48" : "");
+ dev->flags & ATA_DFLAG_LBA48 ? " lba48" : "",
+ dev->flags & ATA_DFLAG_NCQ ? "(NCQ)" : "");
}

/* ATAPI-specific feature tests */
@@ -1187,6 +1196,74 @@
}

/**
+ * ata_read_log_page - read a specific log page
+ * @ap: port on which device we wish to probe resides
+ * @device: device bus address, starting at zero
+ * @page: page to read
+ * @buffer: where to store the read data
+ * @sectors: how much data to read
+ *
+ * After reading the device information page, we use several
+ * bits of information from it to initialize data structures
+ * that will be used during the lifetime of the ata_device.
+ * Other data from the info page is used to disqualify certain
+ * older ATA devices we do not wish to support.
+ *
+ * LOCKING:
+ * Grabs host_set lock.
+ */
+
+int ata_read_log_page(struct ata_port *ap, unsigned int device, char page,
+ char *buffer, unsigned int sectors)
+{
+ struct ata_device *dev = &ap->device[device];
+ DECLARE_COMPLETION(wait);
+ struct ata_queued_cmd *qc;
+ unsigned long flags;
+ u8 status;
+ int rc;
+
+ assert(dev->class == ATA_DEV_ATA);
+
+ ata_dev_select(ap, device, 1, 1);
+
+ qc = ata_qc_new_init(ap, dev);
+ BUG_ON(qc == NULL);
+
+ ata_sg_init_one(qc, buffer, sectors << 9);
+ qc->dma_dir = DMA_FROM_DEVICE;
+ qc->tf.protocol = ATA_PROT_PIO;
+ qc->nsect = sectors;
+
+ qc->tf.command = ATA_CMD_READ_LOG_EXT;
+ qc->tf.nsect = sectors;
+ qc->tf.hob_nsect = sectors >> 8;
+ qc->tf.lbal = page;
+
+ qc->waiting = &wait;
+ qc->complete_fn = ata_qc_complete_noop;
+
+ printk("RLP issue\n");
+ spin_lock_irqsave(&ap->host_set->lock, flags);
+ rc = ata_qc_issue(qc);
+ spin_unlock_irqrestore(&ap->host_set->lock, flags);
+ printk("RLP issue done\n");
+
+ if (rc)
+ return -EIO;
+
+ wait_for_completion(&wait);
+
+ printk("RLP wait done\n");
+
+ status = ata_chk_status(ap);
+ if (status & (ATA_ERR | ATA_ABORTED))
+ return -EIO;
+
+ return 0;
+}
+
+/**
* ata_bus_probe - Reset and probe ATA bus
* @ap: Bus to probe
*
@@ -2753,6 +2830,16 @@
struct ata_port *ap = qc->ap;
unsigned int tag, do_clear = 0;

+ if (likely(qc->flags & ATA_QCFLAG_ACCOUNT)) {
+ if (qc->flags & ATA_QCFLAG_NCQ) {
+ assert(ap->ncq_depth);
+ ap->ncq_depth--;
+ } else {
+ assert(ap->depth);
+ ap->depth--;
+ }
+ }
+
qc->flags = 0;
tag = qc->tag;
if (likely(ata_tag_valid(tag))) {
@@ -2770,6 +2857,8 @@

if (likely(do_clear))
clear_bit(tag, &ap->qactive);
+ if (ap->cmd_waiters)
+ wake_up(&ap->cmd_wait_queue);
}

/**
@@ -2848,6 +2937,52 @@
/* never reached */
}

+/*
+ * NCQ and non-NCQ commands are mutually exclusive. So we have to deny a
+ * request to queue a non-NCQ command, if we have NCQ commands in flight (and
+ * vice versa).
+ */
+static inline int ata_qc_issue_ok(struct ata_port *ap,
+ struct ata_queued_cmd *qc, int waiting)
+{
+ /*
+ * if people are already waiting for a queue drain, don't allow a
+ * new 'lucky' queuer to get in there
+ */
+ if (ap->cmd_waiters > waiting)
+ return 0;
+ if (qc->flags & ATA_QCFLAG_NCQ)
+ return !ap->depth;
+
+ return !(ap->ncq_depth + ap->depth);
+}
+
+static void ata_qc_issue_wait(struct ata_port *ap, struct ata_queued_cmd *qc)
+{
+ DEFINE_WAIT(wait);
+
+ ap->cmd_waiters++;
+
+ do {
+ /*
+ * we rely on the FIFO order of the exclusive waitqueues
+ */
+ prepare_to_wait_exclusive(&ap->cmd_wait_queue, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (!ata_qc_issue_ok(ap, qc, 1)) {
+ spin_unlock_irq(&ap->host_set->lock);
+ schedule();
+ spin_lock_irq(&ap->host_set->lock);
+ }
+
+ finish_wait(&ap->cmd_wait_queue, &wait);
+
+ } while (!ata_qc_issue_ok(ap, qc, 1));
+
+ ap->cmd_waiters--;
+}
+
/**
* ata_qc_issue - issue taskfile to device
* @qc: command to issue to device
@@ -2867,6 +3002,21 @@
int ata_qc_issue(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
+ int rc;
+
+ /*
+ * see if we can queue one more command at this point in time, see
+ * comment at ata_qc_issue_ok(). NCQ commands typically originate from
+ * the SCSI layer, we can ask the mid layer to defer those commands
+ * similar to a QUEUE_FULL condition on a 'real' SCSI device
+ */
+ if (!ata_qc_issue_ok(ap, qc, 0)) {
+ if (qc->flags & ATA_QCFLAG_DEFER)
+ return ATA_QC_ISSUE_DEFER;
+
+ ata_qc_issue_wait(ap, qc);
+ assert(ata_qc_issue_ok(ap, qc, 0));
+ }

if (ata_should_dma_map(qc)) {
if (qc->flags & ATA_QCFLAG_SG) {
@@ -2883,12 +3033,24 @@
ap->ops->qc_prep(qc);

qc->ap->active_tag = qc->tag;
- qc->flags |= ATA_QCFLAG_ACTIVE;
+ qc->flags |= (ATA_QCFLAG_ACTIVE | ATA_QCFLAG_ACCOUNT);
+
+ rc = ap->ops->qc_issue(qc);
+ if (rc != ATA_QC_ISSUE_OK)
+ return rc;

- return ap->ops->qc_issue(qc);
+ if (qc->flags & ATA_QCFLAG_NCQ) {
+ assert(ap->ncq_depth < ATA_MAX_QUEUE)
+ ap->ncq_depth++;
+ } else {
+ assert(!ap->depth);
+ ap->depth++;
+ }

+ return ATA_QC_ISSUE_OK;
err_out:
- return -1;
+ ata_qc_free(qc);
+ return ATA_QC_ISSUE_FATAL;
}

/**
@@ -2950,7 +3112,8 @@

default:
WARN_ON(1);
- return -1;
+ ata_qc_free(qc);
+ return ATA_QC_ISSUE_FATAL;
}

return 0;
@@ -3382,6 +3545,8 @@
ap->ops = ent->port_ops;
ap->cbl = ATA_CBL_NONE;
ap->active_tag = ATA_TAG_POISON;
+ init_waitqueue_head(&ap->cmd_wait_queue);
+ ap->cmd_waiters = 0;
ap->last_ctl = 0xFF;

INIT_WORK(&ap->packet_task, atapi_packet_task, ap);
@@ -4017,6 +4182,7 @@
EXPORT_SYMBOL_GPL(ata_dev_classify);
EXPORT_SYMBOL_GPL(ata_dev_id_string);
EXPORT_SYMBOL_GPL(ata_scsi_simulate);
+EXPORT_SYMBOL_GPL(ata_read_log_page);

#ifdef CONFIG_PCI
EXPORT_SYMBOL_GPL(pci_test_config_bits);
Index: drivers/scsi/libata-scsi.c
===================================================================
--- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/libata-scsi.c (mode:100644)
+++ uncommitted/drivers/scsi/libata-scsi.c (mode:100644)
@@ -336,6 +336,7 @@
if (sdev->id < ATA_MAX_DEVICES) {
struct ata_port *ap;
struct ata_device *dev;
+ int depth;

ap = (struct ata_port *) &sdev->host->hostdata[0];
dev = &ap->device[sdev->id];
@@ -353,6 +354,13 @@
*/
blk_queue_max_sectors(sdev->request_queue, 2048);
}
+
+ if (dev->flags & ATA_DFLAG_NCQ) {
+ int ddepth = ata_id_queue_depth(dev->id) + 1;
+
+ depth = min(sdev->host->can_queue, ddepth);
+ scsi_adjust_queue_depth(sdev, MSG_ORDERED_TAG, depth);
+ }
}

return 0; /* scsi layer doesn't check return value, sigh */
@@ -537,6 +545,7 @@
{
struct ata_taskfile *tf = &qc->tf;
unsigned int lba48 = tf->flags & ATA_TFLAG_LBA48;
+ unsigned int ncq = qc->dev->flags & ATA_DFLAG_NCQ;

tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
tf->protocol = qc->dev->xfer_protocol;
@@ -550,8 +559,18 @@
tf->flags |= ATA_TFLAG_WRITE;
}

+ if (ncq)
+ qc->flags |= ATA_QCFLAG_NCQ;
+
if (scsicmd[0] == READ_10 || scsicmd[0] == WRITE_10) {
- if (lba48) {
+ if (ncq) {
+ tf->hob_feature = scsicmd[7];
+ tf->feature = scsicmd[8];
+ tf->nsect = qc->tag << 3;
+ tf->hob_lbal = scsicmd[2];
+ qc->nsect = ((unsigned int)scsicmd[7] << 8) |
+ scsicmd[8];
+ } else if (lba48) {
tf->hob_nsect = scsicmd[7];
tf->hob_lbal = scsicmd[2];

@@ -569,7 +588,8 @@
qc->nsect = scsicmd[8];
}

- tf->nsect = scsicmd[8];
+ if (!ncq)
+ tf->nsect = scsicmd[8];
tf->lbal = scsicmd[5];
tf->lbam = scsicmd[4];
tf->lbah = scsicmd[3];
@@ -579,7 +599,14 @@
}

if (scsicmd[0] == READ_6 || scsicmd[0] == WRITE_6) {
- qc->nsect = tf->nsect = scsicmd[4];
+ qc->nsect = scsicmd[4];
+
+ if (ncq) {
+ tf->nsect = qc->tag << 3;
+ tf->feature = scsicmd[4];
+ } else
+ tf->nsect = scsicmd[4];
+
tf->lbal = scsicmd[3];
tf->lbam = scsicmd[2];
tf->lbah = scsicmd[1] & 0x1f; /* mask out reserved bits */
@@ -593,7 +620,16 @@
if (scsicmd[2] || scsicmd[3] || scsicmd[10] || scsicmd[11])
return 1;

- if (lba48) {
+ if (ncq) {
+ tf->hob_feature = scsicmd[13];
+ tf->feature = scsicmd[12];
+ tf->nsect = qc->tag << 3;
+ tf->hob_lbal = scsicmd[6];
+ tf->hob_lbam = scsicmd[5];
+ tf->hob_lbah = scsicmd[4];
+ qc->nsect = ((unsigned int)scsicmd[12] << 8) |
+ scsicmd[13];
+ } else if (lba48) {
tf->hob_nsect = scsicmd[12];
tf->hob_lbal = scsicmd[6];
tf->hob_lbam = scsicmd[5];
@@ -613,7 +649,8 @@
qc->nsect = scsicmd[13];
}

- tf->nsect = scsicmd[13];
+ if (!ncq)
+ tf->nsect = scsicmd[13];
tf->lbal = scsicmd[9];
tf->lbam = scsicmd[8];
tf->lbah = scsicmd[7];
@@ -666,6 +703,7 @@
{
struct ata_queued_cmd *qc;
u8 *scsicmd = cmd->cmnd;
+ int ret;

VPRINTK("ENTER\n");

@@ -696,9 +734,18 @@
if (xlat_func(qc, scsicmd))
goto err_out;

+ qc->flags |= ATA_QCFLAG_DEFER;
+
/* select device, send command to hardware */
- if (ata_qc_issue(qc))
+ ret = ata_qc_issue(qc);
+ if (ret == ATA_QC_ISSUE_FATAL)
goto err_out;
+ else if (ret == ATA_QC_ISSUE_DEFER) {
+ VPRINTK("DEFER\n");
+ ata_qc_free(qc);
+ cmd->result = (DID_OK << 16) | (QUEUE_FULL << 1);
+ done(cmd);
+ }

VPRINTK("EXIT\n");
return;
Index: drivers/scsi/scsi_lib.c
===================================================================
--- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/scsi_lib.c (mode:100644)
+++ uncommitted/drivers/scsi/scsi_lib.c (mode:100644)
@@ -1292,13 +1292,17 @@
shost = sdev->host;
while (!blk_queue_plugged(q)) {
int rtn;
+
+ if (!scsi_dev_queue_ready(q, sdev))
+ break;
+
/*
* get next queueable request. We do this early to make sure
* that the request is fully prepared even if we cannot
* accept it.
*/
req = elv_next_request(q);
- if (!req || !scsi_dev_queue_ready(q, sdev))
+ if (!req)
break;

if (unlikely(!scsi_device_online(sdev))) {
Index: drivers/scsi/scsi_sysfs.c
===================================================================
--- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/scsi_sysfs.c (mode:100644)
+++ uncommitted/drivers/scsi/scsi_sysfs.c (mode:100644)
@@ -309,12 +309,35 @@
* Create the actual show/store functions and data structures.
*/
sdev_rd_attr (device_blocked, "%d\n");
-sdev_rd_attr (queue_depth, "%d\n");
sdev_rd_attr (type, "%d\n");
sdev_rd_attr (scsi_level, "%d\n");
sdev_rd_attr (vendor, "%.8s\n");
sdev_rd_attr (model, "%.16s\n");
sdev_rd_attr (rev, "%.4s\n");
+sdev_rd_attr (device_busy, "%d\n");
+
+static ssize_t
+sdev_show_queue_depth(struct device *dev, char *buf)
+{
+ struct scsi_device *sdev = to_scsi_device(dev);
+
+ return snprintf (buf, 20, "%d\n", sdev->queue_depth);
+}
+
+static ssize_t
+sdev_store_queue_depth(struct device *dev, const char *buf, size_t count)
+{
+ struct scsi_device *sdev = to_scsi_device(dev);
+ int depth;
+
+ sscanf(buf, "%d\n", &depth);
+
+ if (depth <= sdev->host->can_queue)
+ scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), depth);
+
+ return count;
+}
+static DEVICE_ATTR(queue_depth, S_IRUGO | S_IWUSR, sdev_show_queue_depth, sdev_store_queue_depth);

static ssize_t
sdev_show_timeout (struct device *dev, char *buf)
@@ -446,6 +469,7 @@
&dev_attr_iorequest_cnt,
&dev_attr_iodone_cnt,
&dev_attr_ioerr_cnt,
+ &dev_attr_device_busy,
NULL
};

Index: include/linux/ata.h
===================================================================
--- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/include/linux/ata.h (mode:100644)
+++ uncommitted/include/linux/ata.h (mode:100644)
@@ -79,7 +79,8 @@
ATA_NIEN = (1 << 1), /* disable-irq flag */
ATA_LBA = (1 << 6), /* LBA28 selector */
ATA_DEV1 = (1 << 4), /* Select Device 1 (slave) */
- ATA_DEVICE_OBS = (1 << 7) | (1 << 5), /* obs bits in dev reg */
+ ATA_DEVICE_OBS = (1 << 5), /* obs bits in dev reg */
+ ATA_FPDMA_FUA = (1 << 7), /* FUA cache bypass bit */
ATA_DEVCTL_OBS = (1 << 3), /* obsolete bit in devctl reg */
ATA_BUSY = (1 << 7), /* BSY status bit */
ATA_DRDY = (1 << 6), /* device ready */
@@ -125,6 +126,12 @@
ATA_CMD_PACKET = 0xA0,
ATA_CMD_VERIFY = 0x40,
ATA_CMD_VERIFY_EXT = 0x42,
+ ATA_CMD_FPDMA_READ = 0x60,
+ ATA_CMD_FPDMA_WRITE = 0x61,
+ ATA_CMD_READ_LOG_EXT = 0x2f,
+
+ /* READ_LOG_EXT pages */
+ READ_LOG_SATA_NCQ_PAGE = 0x10,

/* SETFEATURES stuff */
SETFEATURES_XFER = 0x03,
@@ -234,6 +241,7 @@
#define ata_id_has_lba(id) ((id)[49] & (1 << 9))
#define ata_id_has_dma(id) ((id)[49] & (1 << 8))
#define ata_id_removeable(id) ((id)[0] & (1 << 7))
+#define ata_id_queue_depth(id) ((id)[75] & 0x1f)
#define ata_id_u32(id,n) \
(((u32) (id)[(n) + 1] << 16) | ((u32) (id)[(n)]))
#define ata_id_u64(id,n) \
Index: include/linux/libata.h
===================================================================
--- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/include/linux/libata.h (mode:100644)
+++ uncommitted/include/linux/libata.h (mode:100644)
@@ -80,7 +80,7 @@
LIBATA_MAX_PRD = ATA_MAX_PRD / 2,
ATA_MAX_PORTS = 8,
ATA_DEF_QUEUE = 1,
- ATA_MAX_QUEUE = 1,
+ ATA_MAX_QUEUE = 32,
ATA_MAX_SECTORS = 200, /* FIXME */
ATA_MAX_BUS = 2,
ATA_DEF_BUSY_WAIT = 10000,
@@ -95,6 +95,7 @@
ATA_DFLAG_LBA48 = (1 << 0), /* device supports LBA48 */
ATA_DFLAG_PIO = (1 << 1), /* device currently in PIO mode */
ATA_DFLAG_LOCK_SECTORS = (1 << 2), /* don't adjust max_sectors */
+ ATA_DFLAG_NCQ = (1 << 3), /* Device can do NCQ */

ATA_DEV_UNKNOWN = 0, /* unknown device */
ATA_DEV_ATA = 1, /* ATA device */
@@ -113,11 +114,19 @@
ATA_FLAG_MMIO = (1 << 6), /* use MMIO, not PIO */
ATA_FLAG_SATA_RESET = (1 << 7), /* use COMRESET */
ATA_FLAG_PIO_DMA = (1 << 8), /* PIO cmds via DMA */
+ ATA_FLAG_NCQ = (1 << 9), /* Can do NCQ */

ATA_QCFLAG_ACTIVE = (1 << 1), /* cmd not yet ack'd to scsi lyer */
ATA_QCFLAG_SG = (1 << 3), /* have s/g table? */
ATA_QCFLAG_SINGLE = (1 << 4), /* no s/g, just a single buffer */
ATA_QCFLAG_DMAMAP = ATA_QCFLAG_SG | ATA_QCFLAG_SINGLE,
+ ATA_QCFLAG_NCQ = (1 << 5), /* using NCQ */
+ ATA_QCFLAG_ACCOUNT = (1 << 6), /* depth accounted */
+ ATA_QCFLAG_DEFER = (1 << 7), /* ok to defer */
+
+ ATA_QC_ISSUE_OK = 0,
+ ATA_QC_ISSUE_DEFER = 1,
+ ATA_QC_ISSUE_FATAL = 2,

/* various lengths of time */
ATA_TMOUT_EDD = 5 * HZ, /* hueristic */
@@ -308,6 +317,11 @@
struct ata_queued_cmd qcmd[ATA_MAX_QUEUE];
unsigned long qactive;
unsigned int active_tag;
+ int ncq_depth;
+ int depth;
+
+ wait_queue_head_t cmd_wait_queue;
+ unsigned int cmd_waiters;

struct ata_host_stats stats;
struct ata_host_set *host_set;
@@ -433,6 +447,8 @@
struct block_device *bdev,
sector_t capacity, int geom[]);
extern int ata_scsi_slave_config(struct scsi_device *sdev);
+extern int ata_read_log_page(struct ata_port *, unsigned int, char, char *,
+ unsigned int);


#ifdef CONFIG_PCI

--
Jens Axboe


2005-05-26 16:30:23

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe wrote:
> Hi,
>
> I was curious how well NCQ works on recent SATA disks (and controllers),
> so I hacked in some basic support for NCQ in libata and ahci. First of
> all, ahci is a really nice architecture, it was a joy to work with. NCQ
> likewise, a pleasure to finally see some sane queueing facility for
> SATA/ide!
>
> Results are pretty damn nice, I easily get 30-50% faster random io read
> performance without having to try hard.
>
> Now, this patch is not complete. It should work and work well, but error
> handling isn't really tested or finished yet (by any stretch of the
> imagination). So don't use this on a production machine, it _should_ be
> safe to use on your test boxes and for-fun workstations though (I run it
> here...). I have tested on ich6 and ich7 generation ahci, and with
> Maxtor and Seagate drives.
>
> YMMV. If you try it, do let me know.
>
> Index: drivers/scsi/ahci.c
> ===================================================================
> --- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/ahci.c (mode:100644)
> +++ uncommitted/drivers/scsi/ahci.c (mode:100644)
> @@ -19,8 +19,8 @@
> * If you do not delete the provisions above, a recipient may use your
> * version of this file under either the OSL or the GPL.
> *
> - * Version 1.0 of the AHCI specification:
> - * http://www.intel.com/technology/serialata/pdf/rev1_0.pdf
> + * Version 1.1 of the AHCI specification:
> + * http://www.intel.com/technology/serialata/pdf/rev1_1.pdf
> *
> */
>
> @@ -46,10 +46,13 @@
> AHCI_MAX_SG = 168, /* hardware max is 64K */
> AHCI_DMA_BOUNDARY = 0xffffffff,
> AHCI_USE_CLUSTERING = 0,
> - AHCI_CMD_SLOT_SZ = 32 * 32,
> - AHCI_RX_FIS_SZ = 256,
> + AHCI_MAX_CMDS = 32,
> + AHCI_CMD_SZ = 32,
> AHCI_CMD_TBL_HDR = 0x80,
> - AHCI_CMD_TBL_SZ = AHCI_CMD_TBL_HDR + (AHCI_MAX_SG * 16),
> + AHCI_CMD_SLOT_SZ = AHCI_MAX_CMDS * AHCI_CMD_SZ,
> + AHCI_RX_FIS_SZ = 256,
> + AHCI_CMD_TOTAL = AHCI_CMD_TBL_HDR + (AHCI_MAX_SG * 16),
> + AHCI_CMD_TBL_SZ = AHCI_MAX_CMDS * AHCI_CMD_TOTAL,
> AHCI_PORT_PRIV_DMA_SZ = AHCI_CMD_SLOT_SZ + AHCI_CMD_TBL_SZ +
> AHCI_RX_FIS_SZ,
> AHCI_IRQ_ON_SG = (1 << 31),
> @@ -57,6 +60,7 @@
> AHCI_CMD_WRITE = (1 << 6),
>
> RX_FIS_D2H_REG = 0x40, /* offset of D2H Register FIS data */
> + RX_FIS_SDB_REG = 0x58, /* offset of SDB Register FIS data */
>
> board_ahci = 0,
>
> @@ -74,6 +78,7 @@
>
> /* HOST_CAP bits */
> HOST_CAP_64 = (1 << 31), /* PCI DAC (64-bit DMA) support */
> + HOST_CAP_NCQ = (1 << 30), /* Native Command Queueing */
>
> /* registers for each SATA port */
> PORT_LST_ADDR = 0x00, /* command list DMA addr */
> @@ -161,9 +166,9 @@
> dma_addr_t cmd_slot_dma;
> void *cmd_tbl;
> dma_addr_t cmd_tbl_dma;
> - struct ahci_sg *cmd_tbl_sg;
> void *rx_fis;
> dma_addr_t rx_fis_dma;
> + u32 sactive;
> };
>
> static u32 ahci_scr_read (struct ata_port *ap, unsigned int sc_reg);
> @@ -181,7 +186,7 @@
> static void ahci_qc_prep(struct ata_queued_cmd *qc);
> static u8 ahci_check_status(struct ata_port *ap);
> static u8 ahci_check_err(struct ata_port *ap);
> -static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc);
> +static inline int ahci_host_intr(struct ata_port *ap);
>
> static Scsi_Host_Template ahci_sht = {
> .module = THIS_MODULE,
> @@ -189,7 +194,7 @@
> .ioctl = ata_scsi_ioctl,
> .queuecommand = ata_scsi_queuecmd,
> .eh_strategy_handler = ata_scsi_error,
> - .can_queue = ATA_DEF_QUEUE,
> + .can_queue = ATA_MAX_QUEUE,
> .this_id = ATA_SHT_THIS_ID,
> .sg_tablesize = AHCI_MAX_SG,
> .max_sectors = ATA_MAX_SECTORS,
> @@ -200,7 +205,7 @@
> .dma_boundary = AHCI_DMA_BOUNDARY,
> .slave_configure = ata_scsi_slave_config,
> .bios_param = ata_std_bios_param,
> - .ordered_flush = 1,
> + .ordered_flush = 0, /* conflicts with NCQ for now */
> };
>
> static struct ata_port_operations ahci_ops = {
> @@ -339,14 +344,11 @@
> mem_dma += AHCI_RX_FIS_SZ;
>
> /*
> - * Third item: data area for storing a single command
> - * and its scatter-gather table
> + * Third item: data area for storing commands and their sg tables
> */
> pp->cmd_tbl = mem;
> pp->cmd_tbl_dma = mem_dma;
>
> - pp->cmd_tbl_sg = mem + AHCI_CMD_TBL_HDR;
> -
> ap->private_data = pp;
>
> if (hpriv->cap & HOST_CAP_64)
> @@ -478,9 +480,10 @@
> ata_tf_from_fis(d2h_fis, tf);
> }
>
> -static void ahci_fill_sg(struct ata_queued_cmd *qc)
> +static void ahci_fill_sg(struct ata_queued_cmd *qc, int offset)
> {
> struct ahci_port_priv *pp = qc->ap->private_data;
> + struct ahci_sg *cmd_tbl_sg;
> unsigned int i;
>
> VPRINTK("ENTER\n");
> @@ -488,6 +491,7 @@
> /*
> * Next, the S/G list.
> */
> + cmd_tbl_sg = pp->cmd_tbl + offset + AHCI_CMD_TBL_HDR;
> for (i = 0; i < qc->n_elem; i++) {
> u32 sg_len;
> dma_addr_t addr;
> @@ -495,21 +499,30 @@
> addr = sg_dma_address(&qc->sg[i]);
> sg_len = sg_dma_len(&qc->sg[i]);
>
> - pp->cmd_tbl_sg[i].addr = cpu_to_le32(addr & 0xffffffff);
> - pp->cmd_tbl_sg[i].addr_hi = cpu_to_le32((addr >> 16) >> 16);
> - pp->cmd_tbl_sg[i].flags_size = cpu_to_le32(sg_len - 1);
> + cmd_tbl_sg[i].addr = cpu_to_le32(addr & 0xffffffff);
> + cmd_tbl_sg[i].addr_hi = cpu_to_le32((addr >> 16) >> 16);
> + cmd_tbl_sg[i].flags_size = cpu_to_le32(sg_len - 1);
> }
> }
>
> static void ahci_qc_prep(struct ata_queued_cmd *qc)
> {
> struct ahci_port_priv *pp = qc->ap->private_data;
> - u32 opts;
> + void *port_mmio = (void *) qc->ap->ioaddr.cmd_addr;
> const u32 cmd_fis_len = 5; /* five dwords */
> + dma_addr_t cmd_tbl_dma;
> + u32 opts;
> + int offset;
> +
> + if (qc->flags & ATA_QCFLAG_NCQ) {
> + pp->sactive |= (1 << qc->tag);
> +
> + writel(1 << qc->tag, port_mmio + PORT_SCR_ACT);
> + readl(port_mmio + PORT_SCR_ACT); /* flush */
> + }

Wrong, you should do this in ahci_qc_issue not here.


> /*
> - * Fill in command slot information (currently only one slot,
> - * slot 0, is currently since we don't do queueing)
> + * Fill in command slot information
> */
>
> opts = (qc->n_elem << 16) | cmd_fis_len;
> @@ -528,21 +541,28 @@
> break;
> }
>
> - pp->cmd_slot[0].opts = cpu_to_le32(opts);
> - pp->cmd_slot[0].status = 0;
> - pp->cmd_slot[0].tbl_addr = cpu_to_le32(pp->cmd_tbl_dma & 0xffffffff);
> - pp->cmd_slot[0].tbl_addr_hi = cpu_to_le32((pp->cmd_tbl_dma >> 16) >> 16);
> + /*
> + * the tag determines the offset into the allocated cmd table
> + */
> + offset = qc->tag * AHCI_CMD_TOTAL;
> +
> + cmd_tbl_dma = pp->cmd_tbl_dma + offset;
> +
> + pp->cmd_slot[qc->tag].opts = cpu_to_le32(opts);
> + pp->cmd_slot[qc->tag].status = 0;
> + pp->cmd_slot[qc->tag].tbl_addr = cpu_to_le32(cmd_tbl_dma & 0xffffffff);
> + pp->cmd_slot[qc->tag].tbl_addr_hi = cpu_to_le32((cmd_tbl_dma >> 16) >> 16);
>
> /*
> * Fill in command table information. First, the header,
> * a SATA Register - Host to Device command FIS.
> */
> - ata_tf_to_fis(&qc->tf, pp->cmd_tbl, 0);
> + ata_tf_to_fis(&qc->tf, pp->cmd_tbl + offset, 0);
>
> if (!(qc->flags & ATA_QCFLAG_DMAMAP))
> return;
>
> - ahci_fill_sg(qc);
> + ahci_fill_sg(qc, offset);
> }
>
> static void ahci_intr_error(struct ata_port *ap, u32 irq_stat)
> @@ -593,7 +613,70 @@
> printk(KERN_WARNING "ata%u: error occurred, port reset\n", ap->id);
> }
>
> -static void ahci_eng_timeout(struct ata_port *ap)
> +static void dump_log_page(unsigned char *p)
> +{
> + int i;
> +
> + printk("LOG 0x10: nq=%d, tag=%d\n", p[0] >> 7, p[0] & 0x1f);
> +
> + for (i = 2; i < 14; i++)
> + printk("%d:%d ", i, p[i]);
> +
> + printk("\n");
> +}
> +
> +static void ahci_host_ncq_intr_err(struct ata_port *ap)
> +{
> + struct ahci_port_priv *pp = ap->private_data;
> + void *mmio = ap->host_set->mmio_base;
> + void *port_mmio = ahci_port_base(mmio, ap->port_no);
> + unsigned long flags;
> + char *buffer;
> +
> + printk(KERN_ERR "ata%u: ncq interrupt error\n", ap->id);
> +
> + /*
> + * error all io first
> + */
> + spin_lock_irqsave(&ap->host_set->lock, flags);
> +
> + while (pp->sactive) {
> + struct ata_queued_cmd *qc;
> + int tag = ffs(pp->sactive) - 1;
> +
> + pp->sactive &= ~(1 << tag);
> + qc = ata_qc_from_tag(ap, tag);
> + if (qc)
> + ata_qc_complete(qc, ATA_ERR);

else printk error "I couldn't find the tag!"


> + }
> +
> + spin_unlock_irqrestore(&ap->host_set->lock, flags);
> +
> + ahci_intr_error(ap, readl(port_mmio + PORT_IRQ_STAT));
> +
> + buffer = kmalloc(512, GFP_KERNEL);
> + if (!buffer) {
> + printk(KERN_ERR "ata%u: unable to allocate memory for error\n", ap->id);
> + return;
> + }
> +
> + if (ata_read_log_page(ap, 0, 0x10, buffer, 1))
> + printk(KERN_ERR "ata%u: unable to read log page\n", ap->id);
> + else
> + dump_log_page(buffer);
> +
> + kfree(buffer);
> +}
> +
> +/*
> + * TODO: needs to use READ_LOG_EXT/page=10h to retrieve error information
> + */
> +static void ahci_ncq_timeout(struct ata_port *ap)
> +{
> + ahci_host_ncq_intr_err(ap);
> +}
> +
> +static void ahci_nonncq_timeout(struct ata_port *ap)
> {
> void *mmio = ap->host_set->mmio_base;
> void *port_mmio = ahci_port_base(mmio, ap->port_no);
> @@ -617,13 +700,65 @@
> qc->scsidone = scsi_finish_command;
> ata_qc_complete(qc, ATA_ERR);
> }
> +}
>
> +static void ahci_eng_timeout(struct ata_port *ap)
> +{
> + if (ap->ncq_depth)
> + ahci_ncq_timeout(ap);
> + else
> + ahci_nonncq_timeout(ap);
> }
>
> -static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
> +static inline void ahci_host_ncq_intr(struct ata_port *ap, u32 status)
> {
> + struct ahci_port_priv *pp = ap->private_data;
> void *mmio = ap->host_set->mmio_base;
> void *port_mmio = ahci_port_base(mmio, ap->port_no);
> + u32 tags, sactive;
> +
> + if (status & PORT_IRQ_D2H_REG_FIS) {
> + u8 *d2h_fis = pp->rx_fis + RX_FIS_D2H_REG;
> +
> + /*
> + * pre-BSY clear error, let timeout error handling take care
> + * of it when it kicks in
> + */
> + if (d2h_fis[2] & ATA_ERR)
> + return;
> + } else if (status & PORT_IRQ_SDB_FIS) {
> + u8 *sdb_fis = pp->rx_fis + RX_FIS_SDB_REG;
> +
> + if (sdb_fis[1] & ATA_ERR)
> + return;
> + }
> +
> + /*
> + * SActive will have the bits cleared for completed commands
> + */
> + sactive = readl(port_mmio + PORT_SCR_ACT);
> + tags = pp->sactive & ~sactive;
> +
> + while (tags) {
> + struct ata_queued_cmd *qc;
> + int tag = ffs(tags) - 1;
> +
> + tags &= ~(1 << tag);
> + qc = ata_qc_from_tag(ap, tag);
> + if (qc)
> + ata_qc_complete(qc, 0);
> + else
> + printk(KERN_ERR "ahci: missing tag %d\n", tag);
> + }
> +
> + pp->sactive = sactive;
> +}
> +
> +static inline int ahci_host_intr(struct ata_port *ap)
> +{
> + void *mmio = ap->host_set->mmio_base;
> + void *port_mmio = ahci_port_base(mmio, ap->port_no);
> + struct ata_queued_cmd *qc;
> u32 status, serr, ci;
>
> serr = readl(port_mmio + PORT_SCR_ERR);
> @@ -632,18 +767,27 @@
> status = readl(port_mmio + PORT_IRQ_STAT);
> writel(status, port_mmio + PORT_IRQ_STAT);
>
> - ci = readl(port_mmio + PORT_CMD_ISSUE);
> - if (likely((ci & 0x1) == 0)) {
> - if (qc) {
> - ata_qc_complete(qc, 0);
> - qc = NULL;
> + if (ap->ncq_depth) {
> + if (!serr)

incorrect test. serr is not the only error indicator.


> + ahci_host_ncq_intr(ap, status);
> + else
> + ahci_host_ncq_intr_err(ap);
> + } else {
> + ci = readl(port_mmio + PORT_CMD_ISSUE);
> + if (likely((ci & 0x1) == 0)) {
> + qc = ata_qc_from_tag(ap, ap->active_tag);
> + if (qc) {
> + ata_qc_complete(qc, 0);
> + qc = NULL;
> + }
> }
> - }
>
> - if (status & PORT_IRQ_FATAL) {
> - ahci_intr_error(ap, status);
> - if (qc)
> - ata_qc_complete(qc, ATA_ERR);
> + if (status & PORT_IRQ_FATAL) {
> + ahci_intr_error(ap, status);
> + qc = ata_qc_from_tag(ap, ap->active_tag);
> + if (qc)
> + ata_qc_complete(qc, ATA_ERR);
> + }
> }
>
> return 1;
> @@ -683,9 +827,7 @@
> ap = host_set->ports[i];
> tmp = irq_stat & (1 << i);
> if (tmp && ap) {
> - struct ata_queued_cmd *qc;
> - qc = ata_qc_from_tag(ap, ap->active_tag);
> - if (ahci_host_intr(ap, qc))
> + if (ahci_host_intr(ap))
> irq_ack |= (1 << i);
> }
> }
> @@ -707,10 +849,7 @@
> struct ata_port *ap = qc->ap;
> void *port_mmio = (void *) ap->ioaddr.cmd_addr;
>
> - writel(1, port_mmio + PORT_SCR_ACT);
> - readl(port_mmio + PORT_SCR_ACT); /* flush */
> -
> - writel(1, port_mmio + PORT_CMD_ISSUE);
> + writel(1 << qc->tag, port_mmio + PORT_CMD_ISSUE);
> readl(port_mmio + PORT_CMD_ISSUE); /* flush */
>
> return 0;

as mentioned above, both SActive and Cmd Issue should be written-to here.



> @@ -1027,6 +1166,9 @@
> if (rc)
> goto err_out_hpriv;
>
> + if (hpriv->cap & HOST_CAP_NCQ)
> + probe_ent->host_flags |= ATA_FLAG_NCQ;
> +
> ahci_print_info(probe_ent);
>
> /* FIXME: check ata_device_add return value */
> Index: drivers/scsi/libata-core.c
> ===================================================================
> --- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/libata-core.c (mode:100644)
> +++ uncommitted/drivers/scsi/libata-core.c (mode:100644)
> @@ -519,7 +519,7 @@
> * LOCKING:
> * None.
> */
> -static int ata_prot_to_cmd(int protocol, int lba48)
> +static int ata_prot_to_cmd(int protocol, int lba48, int ncq)
> {
> int rcmd = 0, wcmd = 0;
>
> @@ -535,7 +535,10 @@
> break;
>
> case ATA_PROT_DMA:
> - if (lba48) {
> + if (ncq) {
> + rcmd = ATA_CMD_FPDMA_READ;
> + wcmd = ATA_CMD_FPDMA_WRITE;
> + } else if (lba48) {
> rcmd = ATA_CMD_READ_EXT;
> wcmd = ATA_CMD_WRITE_EXT;
> } else {
> @@ -568,6 +571,7 @@
> {
> int pio = (dev->flags & ATA_DFLAG_PIO);
> int lba48 = (dev->flags & ATA_DFLAG_LBA48);
> + int ncq = (dev->flags & ATA_DFLAG_NCQ);
> int proto, cmd;
>
> if (pio)
> @@ -575,7 +579,7 @@
> else
> proto = dev->xfer_protocol = ATA_PROT_DMA;
>
> - cmd = ata_prot_to_cmd(proto, lba48);
> + cmd = ata_prot_to_cmd(proto, lba48, ncq);
> if (cmd < 0)
> BUG();
>
> @@ -1139,6 +1143,10 @@
> goto err_out_nosup;
> }
>
> + if (ap->flags & ATA_FLAG_NCQ)
> + if (ata_id_queue_depth(dev->id) > 1)
> + dev->flags |= ATA_DFLAG_NCQ;

This will turn on queueing for older TCQ devices that are plugged into
an NCQ-capable board.


> if (ata_id_has_lba48(dev->id)) {
> dev->flags |= ATA_DFLAG_LBA48;
> dev->n_sectors = ata_id_u64(dev->id, 100);
> @@ -1149,11 +1157,12 @@
> ap->host->max_cmd_len = 16;
>
> /* print device info to dmesg */
> - printk(KERN_INFO "ata%u: dev %u ATA, max %s, %Lu sectors:%s\n",
> + printk(KERN_INFO "ata%u: dev %u ATA, max %s, %Lu sectors:%s %s\n",
> ap->id, device,
> ata_mode_string(xfer_modes),
> (unsigned long long)dev->n_sectors,
> - dev->flags & ATA_DFLAG_LBA48 ? " lba48" : "");
> + dev->flags & ATA_DFLAG_LBA48 ? " lba48" : "",
> + dev->flags & ATA_DFLAG_NCQ ? "(NCQ)" : "");
> }
>
> /* ATAPI-specific feature tests */
> @@ -1187,6 +1196,74 @@
> }
>
> /**
> + * ata_read_log_page - read a specific log page
> + * @ap: port on which device we wish to probe resides
> + * @device: device bus address, starting at zero
> + * @page: page to read
> + * @buffer: where to store the read data
> + * @sectors: how much data to read
> + *
> + * After reading the device information page, we use several
> + * bits of information from it to initialize data structures
> + * that will be used during the lifetime of the ata_device.
> + * Other data from the info page is used to disqualify certain
> + * older ATA devices we do not wish to support.
> + *
> + * LOCKING:
> + * Grabs host_set lock.
> + */
> +
> +int ata_read_log_page(struct ata_port *ap, unsigned int device, char page,
> + char *buffer, unsigned int sectors)
> +{
> + struct ata_device *dev = &ap->device[device];
> + DECLARE_COMPLETION(wait);
> + struct ata_queued_cmd *qc;
> + unsigned long flags;
> + u8 status;
> + int rc;
> +
> + assert(dev->class == ATA_DEV_ATA);
> +
> + ata_dev_select(ap, device, 1, 1);

is this needed? These types of calls need to be removed, in general, as
they don't make sense on FIS-based hardware at all.


> + qc = ata_qc_new_init(ap, dev);
> + BUG_ON(qc == NULL);
> +
> + ata_sg_init_one(qc, buffer, sectors << 9);
> + qc->dma_dir = DMA_FROM_DEVICE;
> + qc->tf.protocol = ATA_PROT_PIO;
> + qc->nsect = sectors;
> +
> + qc->tf.command = ATA_CMD_READ_LOG_EXT;
> + qc->tf.nsect = sectors;
> + qc->tf.hob_nsect = sectors >> 8;
> + qc->tf.lbal = page;
> +
> + qc->waiting = &wait;
> + qc->complete_fn = ata_qc_complete_noop;
> +
> + printk("RLP issue\n");
> + spin_lock_irqsave(&ap->host_set->lock, flags);
> + rc = ata_qc_issue(qc);
> + spin_unlock_irqrestore(&ap->host_set->lock, flags);
> + printk("RLP issue done\n");
> +
> + if (rc)
> + return -EIO;
> +
> + wait_for_completion(&wait);
> +
> + printk("RLP wait done\n");
> +
> + status = ata_chk_status(ap);
> + if (status & (ATA_ERR | ATA_ABORTED))
> + return -EIO;

we need to get rid of this too for AHCI-like devices


> + return 0;
> +}
> +
> +/**
> * ata_bus_probe - Reset and probe ATA bus
> * @ap: Bus to probe
> *
> @@ -2753,6 +2830,16 @@
> struct ata_port *ap = qc->ap;
> unsigned int tag, do_clear = 0;
>
> + if (likely(qc->flags & ATA_QCFLAG_ACCOUNT)) {
> + if (qc->flags & ATA_QCFLAG_NCQ) {
> + assert(ap->ncq_depth);
> + ap->ncq_depth--;
> + } else {
> + assert(ap->depth);
> + ap->depth--;
> + }
> + }

why is this accounting conditional?


> qc->flags = 0;
> tag = qc->tag;
> if (likely(ata_tag_valid(tag))) {
> @@ -2770,6 +2857,8 @@
>
> if (likely(do_clear))
> clear_bit(tag, &ap->qactive);
> + if (ap->cmd_waiters)
> + wake_up(&ap->cmd_wait_queue);
> }
>
> /**
> @@ -2848,6 +2937,52 @@
> /* never reached */
> }
>
> +/*
> + * NCQ and non-NCQ commands are mutually exclusive. So we have to deny a
> + * request to queue a non-NCQ command, if we have NCQ commands in flight (and
> + * vice versa).
> + */
> +static inline int ata_qc_issue_ok(struct ata_port *ap,
> + struct ata_queued_cmd *qc, int waiting)
> +{
> + /*
> + * if people are already waiting for a queue drain, don't allow a
> + * new 'lucky' queuer to get in there
> + */
> + if (ap->cmd_waiters > waiting)
> + return 0;
> + if (qc->flags & ATA_QCFLAG_NCQ)
> + return !ap->depth;
> +
> + return !(ap->ncq_depth + ap->depth);
> +}
> +
> +static void ata_qc_issue_wait(struct ata_port *ap, struct ata_queued_cmd *qc)
> +{
> + DEFINE_WAIT(wait);
> +
> + ap->cmd_waiters++;
> +
> + do {
> + /*
> + * we rely on the FIFO order of the exclusive waitqueues
> + */
> + prepare_to_wait_exclusive(&ap->cmd_wait_queue, &wait,
> + TASK_UNINTERRUPTIBLE);
> +
> + if (!ata_qc_issue_ok(ap, qc, 1)) {
> + spin_unlock_irq(&ap->host_set->lock);
> + schedule();
> + spin_lock_irq(&ap->host_set->lock);
> + }
> +
> + finish_wait(&ap->cmd_wait_queue, &wait);
> +
> + } while (!ata_qc_issue_ok(ap, qc, 1));
> +
> + ap->cmd_waiters--;
> +}
> +
> /**
> * ata_qc_issue - issue taskfile to device
> * @qc: command to issue to device
> @@ -2867,6 +3002,21 @@
> int ata_qc_issue(struct ata_queued_cmd *qc)
> {
> struct ata_port *ap = qc->ap;
> + int rc;
> +
> + /*
> + * see if we can queue one more command at this point in time, see
> + * comment at ata_qc_issue_ok(). NCQ commands typically originate from
> + * the SCSI layer, we can ask the mid layer to defer those commands
> + * similar to a QUEUE_FULL condition on a 'real' SCSI device
> + */
> + if (!ata_qc_issue_ok(ap, qc, 0)) {
> + if (qc->flags & ATA_QCFLAG_DEFER)
> + return ATA_QC_ISSUE_DEFER;
> +
> + ata_qc_issue_wait(ap, qc);
> + assert(ata_qc_issue_ok(ap, qc, 0));
> + }
>
> if (ata_should_dma_map(qc)) {
> if (qc->flags & ATA_QCFLAG_SG) {
> @@ -2883,12 +3033,24 @@
> ap->ops->qc_prep(qc);
>
> qc->ap->active_tag = qc->tag;
> - qc->flags |= ATA_QCFLAG_ACTIVE;
> + qc->flags |= (ATA_QCFLAG_ACTIVE | ATA_QCFLAG_ACCOUNT);
> +
> + rc = ap->ops->qc_issue(qc);
> + if (rc != ATA_QC_ISSUE_OK)
> + return rc;
>
> - return ap->ops->qc_issue(qc);
> + if (qc->flags & ATA_QCFLAG_NCQ) {
> + assert(ap->ncq_depth < ATA_MAX_QUEUE)
> + ap->ncq_depth++;
> + } else {
> + assert(!ap->depth);
> + ap->depth++;
> + }
>
> + return ATA_QC_ISSUE_OK;
> err_out:
> - return -1;
> + ata_qc_free(qc);
> + return ATA_QC_ISSUE_FATAL;
> }
>
> /**
> @@ -2950,7 +3112,8 @@
>
> default:
> WARN_ON(1);
> - return -1;
> + ata_qc_free(qc);
> + return ATA_QC_ISSUE_FATAL;
> }
>
> return 0;
> @@ -3382,6 +3545,8 @@
> ap->ops = ent->port_ops;
> ap->cbl = ATA_CBL_NONE;
> ap->active_tag = ATA_TAG_POISON;
> + init_waitqueue_head(&ap->cmd_wait_queue);
> + ap->cmd_waiters = 0;
> ap->last_ctl = 0xFF;
>
> INIT_WORK(&ap->packet_task, atapi_packet_task, ap);
> @@ -4017,6 +4182,7 @@
> EXPORT_SYMBOL_GPL(ata_dev_classify);
> EXPORT_SYMBOL_GPL(ata_dev_id_string);
> EXPORT_SYMBOL_GPL(ata_scsi_simulate);
> +EXPORT_SYMBOL_GPL(ata_read_log_page);
>
> #ifdef CONFIG_PCI
> EXPORT_SYMBOL_GPL(pci_test_config_bits);
> Index: drivers/scsi/libata-scsi.c
> ===================================================================
> --- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/libata-scsi.c (mode:100644)
> +++ uncommitted/drivers/scsi/libata-scsi.c (mode:100644)
> @@ -336,6 +336,7 @@
> if (sdev->id < ATA_MAX_DEVICES) {
> struct ata_port *ap;
> struct ata_device *dev;
> + int depth;
>
> ap = (struct ata_port *) &sdev->host->hostdata[0];
> dev = &ap->device[sdev->id];
> @@ -353,6 +354,13 @@
> */
> blk_queue_max_sectors(sdev->request_queue, 2048);
> }
> +
> + if (dev->flags & ATA_DFLAG_NCQ) {
> + int ddepth = ata_id_queue_depth(dev->id) + 1;
> +
> + depth = min(sdev->host->can_queue, ddepth);
> + scsi_adjust_queue_depth(sdev, MSG_ORDERED_TAG, depth);

For all hardware that uses SActive (all NCQ), the max is 31 not 32.

32 tags == 0xffffffff SActive, which is the same value as that which
occurs when the hardware is dead/unplugged/etc.

Also... NCQ does not provide ordered tags. I think MSG_SIMPLE_TAG is
more appropriate.


> + }
> }
>
> return 0; /* scsi layer doesn't check return value, sigh */
> @@ -537,6 +545,7 @@
> {
> struct ata_taskfile *tf = &qc->tf;
> unsigned int lba48 = tf->flags & ATA_TFLAG_LBA48;
> + unsigned int ncq = qc->dev->flags & ATA_DFLAG_NCQ;
>
> tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
> tf->protocol = qc->dev->xfer_protocol;
> @@ -550,8 +559,18 @@
> tf->flags |= ATA_TFLAG_WRITE;
> }
>
> + if (ncq)
> + qc->flags |= ATA_QCFLAG_NCQ;
> +
> if (scsicmd[0] == READ_10 || scsicmd[0] == WRITE_10) {
> - if (lba48) {
> + if (ncq) {
> + tf->hob_feature = scsicmd[7];
> + tf->feature = scsicmd[8];
> + tf->nsect = qc->tag << 3;
> + tf->hob_lbal = scsicmd[2];
> + qc->nsect = ((unsigned int)scsicmd[7] << 8) |
> + scsicmd[8];
> + } else if (lba48) {
> tf->hob_nsect = scsicmd[7];
> tf->hob_lbal = scsicmd[2];
>
> @@ -569,7 +588,8 @@
> qc->nsect = scsicmd[8];
> }
>
> - tf->nsect = scsicmd[8];
> + if (!ncq)
> + tf->nsect = scsicmd[8];

the other changes seem fine, but this seem strange


> tf->lbal = scsicmd[5];
> tf->lbam = scsicmd[4];
> tf->lbah = scsicmd[3];
> @@ -579,7 +599,14 @@
> }
>
> if (scsicmd[0] == READ_6 || scsicmd[0] == WRITE_6) {
> - qc->nsect = tf->nsect = scsicmd[4];
> + qc->nsect = scsicmd[4];
> +
> + if (ncq) {
> + tf->nsect = qc->tag << 3;
> + tf->feature = scsicmd[4];
> + } else
> + tf->nsect = scsicmd[4];
> +
> tf->lbal = scsicmd[3];
> tf->lbam = scsicmd[2];
> tf->lbah = scsicmd[1] & 0x1f; /* mask out reserved bits */
> @@ -593,7 +620,16 @@
> if (scsicmd[2] || scsicmd[3] || scsicmd[10] || scsicmd[11])
> return 1;
>
> - if (lba48) {
> + if (ncq) {
> + tf->hob_feature = scsicmd[13];
> + tf->feature = scsicmd[12];
> + tf->nsect = qc->tag << 3;
> + tf->hob_lbal = scsicmd[6];
> + tf->hob_lbam = scsicmd[5];
> + tf->hob_lbah = scsicmd[4];
> + qc->nsect = ((unsigned int)scsicmd[12] << 8) |
> + scsicmd[13];
> + } else if (lba48) {
> tf->hob_nsect = scsicmd[12];
> tf->hob_lbal = scsicmd[6];
> tf->hob_lbam = scsicmd[5];
> @@ -613,7 +649,8 @@
> qc->nsect = scsicmd[13];
> }
>
> - tf->nsect = scsicmd[13];
> + if (!ncq)
> + tf->nsect = scsicmd[13];
> tf->lbal = scsicmd[9];
> tf->lbam = scsicmd[8];
> tf->lbah = scsicmd[7];
> @@ -666,6 +703,7 @@
> {
> struct ata_queued_cmd *qc;
> u8 *scsicmd = cmd->cmnd;
> + int ret;
>
> VPRINTK("ENTER\n");
>
> @@ -696,9 +734,18 @@
> if (xlat_func(qc, scsicmd))
> goto err_out;
>
> + qc->flags |= ATA_QCFLAG_DEFER;
> +
> /* select device, send command to hardware */
> - if (ata_qc_issue(qc))
> + ret = ata_qc_issue(qc);
> + if (ret == ATA_QC_ISSUE_FATAL)
> goto err_out;
> + else if (ret == ATA_QC_ISSUE_DEFER) {
> + VPRINTK("DEFER\n");
> + ata_qc_free(qc);
> + cmd->result = (DID_OK << 16) | (QUEUE_FULL << 1);
> + done(cmd);
> + }
>
> VPRINTK("EXIT\n");
> return;
> Index: drivers/scsi/scsi_lib.c
> ===================================================================
> --- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/scsi_lib.c (mode:100644)
> +++ uncommitted/drivers/scsi/scsi_lib.c (mode:100644)
> @@ -1292,13 +1292,17 @@
> shost = sdev->host;
> while (!blk_queue_plugged(q)) {
> int rtn;
> +
> + if (!scsi_dev_queue_ready(q, sdev))
> + break;
> +
> /*
> * get next queueable request. We do this early to make sure
> * that the request is fully prepared even if we cannot
> * accept it.
> */
> req = elv_next_request(q);
> - if (!req || !scsi_dev_queue_ready(q, sdev))
> + if (!req)
> break;
>
> if (unlikely(!scsi_device_online(sdev))) {
> Index: drivers/scsi/scsi_sysfs.c
> ===================================================================
> --- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/scsi_sysfs.c (mode:100644)
> +++ uncommitted/drivers/scsi/scsi_sysfs.c (mode:100644)
> @@ -309,12 +309,35 @@
> * Create the actual show/store functions and data structures.
> */
> sdev_rd_attr (device_blocked, "%d\n");
> -sdev_rd_attr (queue_depth, "%d\n");
> sdev_rd_attr (type, "%d\n");
> sdev_rd_attr (scsi_level, "%d\n");
> sdev_rd_attr (vendor, "%.8s\n");
> sdev_rd_attr (model, "%.16s\n");
> sdev_rd_attr (rev, "%.4s\n");
> +sdev_rd_attr (device_busy, "%d\n");
> +
> +static ssize_t
> +sdev_show_queue_depth(struct device *dev, char *buf)
> +{
> + struct scsi_device *sdev = to_scsi_device(dev);
> +
> + return snprintf (buf, 20, "%d\n", sdev->queue_depth);
> +}
> +
> +static ssize_t
> +sdev_store_queue_depth(struct device *dev, const char *buf, size_t count)
> +{
> + struct scsi_device *sdev = to_scsi_device(dev);
> + int depth;
> +
> + sscanf(buf, "%d\n", &depth);
> +
> + if (depth <= sdev->host->can_queue)
> + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), depth);
> +
> + return count;
> +}
> +static DEVICE_ATTR(queue_depth, S_IRUGO | S_IWUSR, sdev_show_queue_depth, sdev_store_queue_depth);
>
> static ssize_t
> sdev_show_timeout (struct device *dev, char *buf)
> @@ -446,6 +469,7 @@
> &dev_attr_iorequest_cnt,
> &dev_attr_iodone_cnt,
> &dev_attr_ioerr_cnt,
> + &dev_attr_device_busy,
> NULL
> };
>
> Index: include/linux/ata.h
> ===================================================================
> --- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/include/linux/ata.h (mode:100644)
> +++ uncommitted/include/linux/ata.h (mode:100644)
> @@ -79,7 +79,8 @@
> ATA_NIEN = (1 << 1), /* disable-irq flag */
> ATA_LBA = (1 << 6), /* LBA28 selector */
> ATA_DEV1 = (1 << 4), /* Select Device 1 (slave) */
> - ATA_DEVICE_OBS = (1 << 7) | (1 << 5), /* obs bits in dev reg */
> + ATA_DEVICE_OBS = (1 << 5), /* obs bits in dev reg */
> + ATA_FPDMA_FUA = (1 << 7), /* FUA cache bypass bit */
> ATA_DEVCTL_OBS = (1 << 3), /* obsolete bit in devctl reg */
> ATA_BUSY = (1 << 7), /* BSY status bit */
> ATA_DRDY = (1 << 6), /* device ready */
> @@ -125,6 +126,12 @@
> ATA_CMD_PACKET = 0xA0,
> ATA_CMD_VERIFY = 0x40,
> ATA_CMD_VERIFY_EXT = 0x42,
> + ATA_CMD_FPDMA_READ = 0x60,
> + ATA_CMD_FPDMA_WRITE = 0x61,
> + ATA_CMD_READ_LOG_EXT = 0x2f,
> +
> + /* READ_LOG_EXT pages */
> + READ_LOG_SATA_NCQ_PAGE = 0x10,
>
> /* SETFEATURES stuff */
> SETFEATURES_XFER = 0x03,
> @@ -234,6 +241,7 @@
> #define ata_id_has_lba(id) ((id)[49] & (1 << 9))
> #define ata_id_has_dma(id) ((id)[49] & (1 << 8))
> #define ata_id_removeable(id) ((id)[0] & (1 << 7))
> +#define ata_id_queue_depth(id) ((id)[75] & 0x1f)
> #define ata_id_u32(id,n) \
> (((u32) (id)[(n) + 1] << 16) | ((u32) (id)[(n)]))
> #define ata_id_u64(id,n) \
> Index: include/linux/libata.h
> ===================================================================
> --- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/include/linux/libata.h (mode:100644)
> +++ uncommitted/include/linux/libata.h (mode:100644)
> @@ -80,7 +80,7 @@
> LIBATA_MAX_PRD = ATA_MAX_PRD / 2,
> ATA_MAX_PORTS = 8,
> ATA_DEF_QUEUE = 1,
> - ATA_MAX_QUEUE = 1,
> + ATA_MAX_QUEUE = 32,
> ATA_MAX_SECTORS = 200, /* FIXME */
> ATA_MAX_BUS = 2,
> ATA_DEF_BUSY_WAIT = 10000,
> @@ -95,6 +95,7 @@
> ATA_DFLAG_LBA48 = (1 << 0), /* device supports LBA48 */
> ATA_DFLAG_PIO = (1 << 1), /* device currently in PIO mode */
> ATA_DFLAG_LOCK_SECTORS = (1 << 2), /* don't adjust max_sectors */
> + ATA_DFLAG_NCQ = (1 << 3), /* Device can do NCQ */
>
> ATA_DEV_UNKNOWN = 0, /* unknown device */
> ATA_DEV_ATA = 1, /* ATA device */
> @@ -113,11 +114,19 @@
> ATA_FLAG_MMIO = (1 << 6), /* use MMIO, not PIO */
> ATA_FLAG_SATA_RESET = (1 << 7), /* use COMRESET */
> ATA_FLAG_PIO_DMA = (1 << 8), /* PIO cmds via DMA */
> + ATA_FLAG_NCQ = (1 << 9), /* Can do NCQ */
>
> ATA_QCFLAG_ACTIVE = (1 << 1), /* cmd not yet ack'd to scsi lyer */
> ATA_QCFLAG_SG = (1 << 3), /* have s/g table? */
> ATA_QCFLAG_SINGLE = (1 << 4), /* no s/g, just a single buffer */
> ATA_QCFLAG_DMAMAP = ATA_QCFLAG_SG | ATA_QCFLAG_SINGLE,
> + ATA_QCFLAG_NCQ = (1 << 5), /* using NCQ */
> + ATA_QCFLAG_ACCOUNT = (1 << 6), /* depth accounted */
> + ATA_QCFLAG_DEFER = (1 << 7), /* ok to defer */
> +
> + ATA_QC_ISSUE_OK = 0,
> + ATA_QC_ISSUE_DEFER = 1,
> + ATA_QC_ISSUE_FATAL = 2,
>
> /* various lengths of time */
> ATA_TMOUT_EDD = 5 * HZ, /* hueristic */
> @@ -308,6 +317,11 @@
> struct ata_queued_cmd qcmd[ATA_MAX_QUEUE];
> unsigned long qactive;
> unsigned int active_tag;
> + int ncq_depth;
> + int depth;

I don't think we need two depths


> + wait_queue_head_t cmd_wait_queue;
> + unsigned int cmd_waiters;
>
> struct ata_host_stats stats;
> struct ata_host_set *host_set;
> @@ -433,6 +447,8 @@
> struct block_device *bdev,
> sector_t capacity, int geom[]);
> extern int ata_scsi_slave_config(struct scsi_device *sdev);
> +extern int ata_read_log_page(struct ata_port *, unsigned int, char, char *,
> + unsigned int);
>
>
> #ifdef CONFIG_PCI
>

2005-05-26 17:12:17

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Thu, May 26 2005, Jeff Garzik wrote:
> > static void ahci_qc_prep(struct ata_queued_cmd *qc)
> > {
> > struct ahci_port_priv *pp = qc->ap->private_data;
> >- u32 opts;
> >+ void *port_mmio = (void *) qc->ap->ioaddr.cmd_addr;
> > const u32 cmd_fis_len = 5; /* five dwords */
> >+ dma_addr_t cmd_tbl_dma;
> >+ u32 opts;
> >+ int offset;
> >+
> >+ if (qc->flags & ATA_QCFLAG_NCQ) {
> >+ pp->sactive |= (1 << qc->tag);
> >+
> >+ writel(1 << qc->tag, port_mmio + PORT_SCR_ACT);
> >+ readl(port_mmio + PORT_SCR_ACT); /* flush */
> >+ }
>
> Wrong, you should do this in ahci_qc_issue not here.

Are you sure, I moved this on purpose? I think the reason I did this was
the wording at the back of the the sata-ii spec (appendix b) that says
something ala 'preset the active bit and transmit a register FIS'. Feel
free to point me at the authoritative wording in the ACHI spec.

One thing that I definitely think _was_ wrong with the sactive bit
before, is that you set it unconditionally of whether this was an NCQ
command or not. The maxtor drives don't clear sactive on non-fpdma
commands, which confused me at first.

> >+static void ahci_host_ncq_intr_err(struct ata_port *ap)
> >+{
> >+ struct ahci_port_priv *pp = ap->private_data;
> >+ void *mmio = ap->host_set->mmio_base;
> >+ void *port_mmio = ahci_port_base(mmio, ap->port_no);
> >+ unsigned long flags;
> >+ char *buffer;
> >+
> >+ printk(KERN_ERR "ata%u: ncq interrupt error\n", ap->id);
> >+
> >+ /*
> >+ * error all io first
> >+ */
> >+ spin_lock_irqsave(&ap->host_set->lock, flags);
> >+
> >+ while (pp->sactive) {
> >+ struct ata_queued_cmd *qc;
> >+ int tag = ffs(pp->sactive) - 1;
> >+
> >+ pp->sactive &= ~(1 << tag);
> >+ qc = ata_qc_from_tag(ap, tag);
> >+ if (qc)
> >+ ata_qc_complete(qc, ATA_ERR);
>
> else printk error "I couldn't find the tag!"

Agree.

> >@@ -632,18 +767,27 @@
> > status = readl(port_mmio + PORT_IRQ_STAT);
> > writel(status, port_mmio + PORT_IRQ_STAT);
> >
> >- ci = readl(port_mmio + PORT_CMD_ISSUE);
> >- if (likely((ci & 0x1) == 0)) {
> >- if (qc) {
> >- ata_qc_complete(qc, 0);
> >- qc = NULL;
> >+ if (ap->ncq_depth) {
> >+ if (!serr)
>
> incorrect test. serr is not the only error indicator.

Yes, I'm aware of that. There are still quite a few loose ends wrt error
handling, as mentioned. I also don't check the irq fatal stat in the ncq
interrupt handler.

> >@@ -1139,6 +1143,10 @@
> > goto err_out_nosup;
> > }
> >
> >+ if (ap->flags & ATA_FLAG_NCQ)
> >+ if (ata_id_queue_depth(dev->id) > 1)
> >+ dev->flags |= ATA_DFLAG_NCQ;
>
> This will turn on queueing for older TCQ devices that are plugged into
> an NCQ-capable board.

Yup, that is an error. At that time, I could not easily find what
feature bit encompassed NCQ, that needs to be added of course.

> >+int ata_read_log_page(struct ata_port *ap, unsigned int device, char page,
> >+ char *buffer, unsigned int sectors)
> >+{
> >+ struct ata_device *dev = &ap->device[device];
> >+ DECLARE_COMPLETION(wait);
> >+ struct ata_queued_cmd *qc;
> >+ unsigned long flags;
> >+ u8 status;
> >+ int rc;
> >+
> >+ assert(dev->class == ATA_DEV_ATA);
> >+
> >+ ata_dev_select(ap, device, 1, 1);
>
> is this needed? These types of calls need to be removed, in general, as
> they don't make sense on FIS-based hardware at all.

You tell me, this read_log_page() was mainly copy-pasted from the pio
driven function above it. I'll try and kill the select when doing error
testing.

> >+ printk("RLP issue\n");
> >+ spin_lock_irqsave(&ap->host_set->lock, flags);
> >+ rc = ata_qc_issue(qc);
> >+ spin_unlock_irqrestore(&ap->host_set->lock, flags);
> >+ printk("RLP issue done\n");
> >+
> >+ if (rc)
> >+ return -EIO;
> >+
> >+ wait_for_completion(&wait);
> >+
> >+ printk("RLP wait done\n");
> >+
> >+ status = ata_chk_status(ap);
> >+ if (status & (ATA_ERR | ATA_ABORTED))
> >+ return -EIO;
>
> we need to get rid of this too for AHCI-like devices

Can you expand on that?

> >@@ -2753,6 +2830,16 @@
> > struct ata_port *ap = qc->ap;
> > unsigned int tag, do_clear = 0;
> >
> >+ if (likely(qc->flags & ATA_QCFLAG_ACCOUNT)) {
> >+ if (qc->flags & ATA_QCFLAG_NCQ) {
> >+ assert(ap->ncq_depth);
> >+ ap->ncq_depth--;
> >+ } else {
> >+ assert(ap->depth);
> >+ ap->depth--;
> >+ }
> >+ }
>
> why is this accounting conditional?

Because if you free a qc before it has gone to the device, you don't
want to account it. Hmm, perhaps ACTIVE would work fine in fact. I'll
double check!

> > #ifdef CONFIG_PCI
> > EXPORT_SYMBOL_GPL(pci_test_config_bits);
> >Index: drivers/scsi/libata-scsi.c
> >===================================================================
> >--- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/libata-scsi.c
> >(mode:100644)
> >+++ uncommitted/drivers/scsi/libata-scsi.c (mode:100644)
> >@@ -336,6 +336,7 @@
> > if (sdev->id < ATA_MAX_DEVICES) {
> > struct ata_port *ap;
> > struct ata_device *dev;
> >+ int depth;
> >
> > ap = (struct ata_port *) &sdev->host->hostdata[0];
> > dev = &ap->device[sdev->id];
> >@@ -353,6 +354,13 @@
> > */
> > blk_queue_max_sectors(sdev->request_queue, 2048);
> > }
> >+
> >+ if (dev->flags & ATA_DFLAG_NCQ) {
> >+ int ddepth = ata_id_queue_depth(dev->id) + 1;
> >+
> >+ depth = min(sdev->host->can_queue, ddepth);
> >+ scsi_adjust_queue_depth(sdev, MSG_ORDERED_TAG,
> >depth);
>
> For all hardware that uses SActive (all NCQ), the max is 31 not 32.

That's not true, the max is 32 counting 0 as a valid tag. So 31 is
indeed th max tag value, but 32 is the depth.

> 32 tags == 0xffffffff SActive, which is the same value as that which
> occurs when the hardware is dead/unplugged/etc.

Well you would think you would notice the hardware going dead by other
means than just SActive, right? Seems silly to cap the depth because of
that.

> Also... NCQ does not provide ordered tags. I think MSG_SIMPLE_TAG is
> more appropriate.

Yep indeed, thanks.

> >+ if (ncq)
> >+ qc->flags |= ATA_QCFLAG_NCQ;
> >+
> > if (scsicmd[0] == READ_10 || scsicmd[0] == WRITE_10) {
> >- if (lba48) {
> >+ if (ncq) {
> >+ tf->hob_feature = scsicmd[7];
> >+ tf->feature = scsicmd[8];
> >+ tf->nsect = qc->tag << 3;
> >+ tf->hob_lbal = scsicmd[2];
> >+ qc->nsect = ((unsigned int)scsicmd[7] << 8) |
> >+ scsicmd[8];
> >+ } else if (lba48) {
> > tf->hob_nsect = scsicmd[7];
> > tf->hob_lbal = scsicmd[2];
> >
> >@@ -569,7 +588,8 @@
> > qc->nsect = scsicmd[8];
> > }
> >
> >- tf->nsect = scsicmd[8];
> >+ if (!ncq)
> >+ tf->nsect = scsicmd[8];
>
> the other changes seem fine, but this seem strange

How so? FPDMA has the tag in nsect, others store the bottom 8 bits of
lba.

> >@@ -308,6 +317,11 @@
> > struct ata_queued_cmd qcmd[ATA_MAX_QUEUE];
> > unsigned long qactive;
> > unsigned int active_tag;
> >+ int ncq_depth;
> >+ int depth;
>
> I don't think we need two depths

The two depths were added because we need to differentiate between the
two for issuing new commands. ncq_depth > 0 is fine for issuing a new
FPDMA request, where as non-FPDMA commands need both !ncq_depth and
!depth.

Thanks for your comments!

--
Jens Axboe

2005-05-26 17:16:30

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Thu, May 26 2005, Jens Axboe wrote:
> On Thu, May 26 2005, Jeff Garzik wrote:
> > > static void ahci_qc_prep(struct ata_queued_cmd *qc)
> > > {
> > > struct ahci_port_priv *pp = qc->ap->private_data;
> > >- u32 opts;
> > >+ void *port_mmio = (void *) qc->ap->ioaddr.cmd_addr;
> > > const u32 cmd_fis_len = 5; /* five dwords */
> > >+ dma_addr_t cmd_tbl_dma;
> > >+ u32 opts;
> > >+ int offset;
> > >+
> > >+ if (qc->flags & ATA_QCFLAG_NCQ) {
> > >+ pp->sactive |= (1 << qc->tag);
> > >+
> > >+ writel(1 << qc->tag, port_mmio + PORT_SCR_ACT);
> > >+ readl(port_mmio + PORT_SCR_ACT); /* flush */
> > >+ }
> >
> > Wrong, you should do this in ahci_qc_issue not here.
>
> Are you sure, I moved this on purpose? I think the reason I did this was
> the wording at the back of the the sata-ii spec (appendix b) that says
> something ala 'preset the active bit and transmit a register FIS'. Feel
> free to point me at the authoritative wording in the ACHI spec.
>
> One thing that I definitely think _was_ wrong with the sactive bit
> before, is that you set it unconditionally of whether this was an NCQ
> command or not. The maxtor drives don't clear sactive on non-fpdma
> commands, which confused me at first.

Re-reading AHCI spec, it does indicate that you want to set SActive
after building the command. I'll move it back, but keep the conditional
of setting SActive on queued commands.

--
Jens Axboe

2005-05-26 17:17:02

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe wrote:
> Re-reading AHCI spec, it does indicate that you want to set SActive
> after building the command. I'll move it back, but keep the conditional
> of setting SActive on queued commands.

SActive is intentionally used for non-NCQ devices. The SATA registers
are -host- registers not -device- registers, remember.

At the very least, I would like to see a lot of testing before you make
the current unconditional code conditional.

Jeff


2005-05-26 17:32:13

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Thu, May 26 2005, Jeff Garzik wrote:
> Jens Axboe wrote:
> >Re-reading AHCI spec, it does indicate that you want to set SActive
> >after building the command. I'll move it back, but keep the conditional
> >of setting SActive on queued commands.
>
> SActive is intentionally used for non-NCQ devices. The SATA registers
> are -host- registers not -device- registers, remember.

But the host sets SActive only, the device clears it. And at least the
maxtor drives don't clear SActive on non-NCQ commands, which makes
things really confusing once you have completed a non-NCQ command and
start doing some NCQ ones. Page 59 of the AHCI 1.1 spec reads:

4. If it is a queued command, software shall first set
PxSACT.DS(pFreeSlot). [...]

So I really do think this is an error in ahci.c since the beginning.

> At the very least, I would like to see a lot of testing before you make
> the current unconditional code conditional.

Perhaps you could include such a patch in libata-dev for a while, if you
wish? I really cannot remove it from the NCQ patch.

--
Jens Axboe

2005-05-26 19:49:56

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe wrote:
> On Thu, May 26 2005, Jeff Garzik wrote:
>>>+int ata_read_log_page(struct ata_port *ap, unsigned int device, char page,
>>>+ char *buffer, unsigned int sectors)
>>>+{
>>>+ struct ata_device *dev = &ap->device[device];
>>>+ DECLARE_COMPLETION(wait);
>>>+ struct ata_queued_cmd *qc;
>>>+ unsigned long flags;
>>>+ u8 status;
>>>+ int rc;
>>>+
>>>+ assert(dev->class == ATA_DEV_ATA);
>>>+
>>>+ ata_dev_select(ap, device, 1, 1);
>>
>>is this needed? These types of calls need to be removed, in general, as
>>they don't make sense on FIS-based hardware at all.
>
>
> You tell me, this read_log_page() was mainly copy-pasted from the pio
> driven function above it. I'll try and kill the select when doing error
> testing.
>
>
>>>+ printk("RLP issue\n");
>>>+ spin_lock_irqsave(&ap->host_set->lock, flags);
>>>+ rc = ata_qc_issue(qc);
>>>+ spin_unlock_irqrestore(&ap->host_set->lock, flags);
>>>+ printk("RLP issue done\n");
>>>+
>>>+ if (rc)
>>>+ return -EIO;
>>>+
>>>+ wait_for_completion(&wait);
>>>+
>>>+ printk("RLP wait done\n");
>>>+
>>>+ status = ata_chk_status(ap);
>>>+ if (status & (ATA_ERR | ATA_ABORTED))
>>>+ return -EIO;
>>
>>we need to get rid of this too for AHCI-like devices
>
>
> Can you expand on that?

(this covers both quoted questions above)

The PIO function assumes that PCI IDE-like ATA register blocks (command
registers, control registers) are available. The read-log-page function
can make no such assumptions.

dev-select and check-status should both be done by the machinery that
occurs once you start things in motion by calling ata_qc_issue().

Doing things this way is necessary for FIS-based hardware like AHCI or
SiI 3124.


>>>#ifdef CONFIG_PCI
>>>EXPORT_SYMBOL_GPL(pci_test_config_bits);
>>>Index: drivers/scsi/libata-scsi.c
>>>===================================================================
>>>--- f5c58b6b0cfd2a92fb3b1d1f4cbfdfb3df6f45d6/drivers/scsi/libata-scsi.c
>>>(mode:100644)
>>>+++ uncommitted/drivers/scsi/libata-scsi.c (mode:100644)
>>>@@ -336,6 +336,7 @@
>>> if (sdev->id < ATA_MAX_DEVICES) {
>>> struct ata_port *ap;
>>> struct ata_device *dev;
>>>+ int depth;
>>>
>>> ap = (struct ata_port *) &sdev->host->hostdata[0];
>>> dev = &ap->device[sdev->id];
>>>@@ -353,6 +354,13 @@
>>> */
>>> blk_queue_max_sectors(sdev->request_queue, 2048);
>>> }
>>>+
>>>+ if (dev->flags & ATA_DFLAG_NCQ) {
>>>+ int ddepth = ata_id_queue_depth(dev->id) + 1;
>>>+
>>>+ depth = min(sdev->host->can_queue, ddepth);
>>>+ scsi_adjust_queue_depth(sdev, MSG_ORDERED_TAG,
>>>depth);
>>
>>For all hardware that uses SActive (all NCQ), the max is 31 not 32.
>
>
> That's not true, the max is 32 counting 0 as a valid tag. So 31 is
> indeed th max tag value, but 32 is the depth.

I was talking about depth. In libata, it's a policy decision to never
use more than 31 tags at any given time.

You can change it from 31 to 32 in SuSE for value-add, if you wish :)

Note also that error handling occasionally needs a command slot, so the
limit may even be 30 (or 31 at most).



> The two depths were added because we need to differentiate between the
> two for issuing new commands. ncq_depth > 0 is fine for issuing a new
> FPDMA request, where as non-FPDMA commands need both !ncq_depth and
> !depth.

You can definitely handle both FPDMA and non-FPDMA with a single
variable. Think harder on this one. You have flags to work with, you
know...

Jeff


2005-05-26 20:31:14

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Thu, May 26 2005, Jeff Garzik wrote:
> Jens Axboe wrote:
> >On Thu, May 26 2005, Jeff Garzik wrote:
> >>>+int ata_read_log_page(struct ata_port *ap, unsigned int device, char
> >>>page,
> >>>+ char *buffer, unsigned int sectors)
> >>>+{
> >>>+ struct ata_device *dev = &ap->device[device];
> >>>+ DECLARE_COMPLETION(wait);
> >>>+ struct ata_queued_cmd *qc;
> >>>+ unsigned long flags;
> >>>+ u8 status;
> >>>+ int rc;
> >>>+
> >>>+ assert(dev->class == ATA_DEV_ATA);
> >>>+
> >>>+ ata_dev_select(ap, device, 1, 1);
> >>
> >>is this needed? These types of calls need to be removed, in general, as
> >>they don't make sense on FIS-based hardware at all.
> >
> >
> >You tell me, this read_log_page() was mainly copy-pasted from the pio
> >driven function above it. I'll try and kill the select when doing error
> >testing.
> >
> >
> >>>+ printk("RLP issue\n");
> >>>+ spin_lock_irqsave(&ap->host_set->lock, flags);
> >>>+ rc = ata_qc_issue(qc);
> >>>+ spin_unlock_irqrestore(&ap->host_set->lock, flags);
> >>>+ printk("RLP issue done\n");
> >>>+
> >>>+ if (rc)
> >>>+ return -EIO;
> >>>+
> >>>+ wait_for_completion(&wait);
> >>>+
> >>>+ printk("RLP wait done\n");
> >>>+
> >>>+ status = ata_chk_status(ap);
> >>>+ if (status & (ATA_ERR | ATA_ABORTED))
> >>>+ return -EIO;
> >>
> >>we need to get rid of this too for AHCI-like devices
> >
> >
> >Can you expand on that?
>
> (this covers both quoted questions above)
>
> The PIO function assumes that PCI IDE-like ATA register blocks (command
> registers, control registers) are available. The read-log-page function
> can make no such assumptions.
>
> dev-select and check-status should both be done by the machinery that
> occurs once you start things in motion by calling ata_qc_issue().
>
> Doing things this way is necessary for FIS-based hardware like AHCI or
> SiI 3124.

I'll get it cleaned up and tested.

> >>For all hardware that uses SActive (all NCQ), the max is 31 not 32.
> >
> >
> >That's not true, the max is 32 counting 0 as a valid tag. So 31 is
> >indeed th max tag value, but 32 is the depth.
>
> I was talking about depth. In libata, it's a policy decision to never
> use more than 31 tags at any given time.
>
> You can change it from 31 to 32 in SuSE for value-add, if you wish :)
>
> Note also that error handling occasionally needs a command slot, so the
> limit may even be 30 (or 31 at most).

Reserving one for error handling makes a lot of sense, that's a good
point. I still don't buy your argument for not using the full 32 slots
in total, though. But in the end I don't suppose it matters a lot, 31 or
30 for queue depth is very much at the end of the spectrum of
diminishing return in difference.

> >The two depths were added because we need to differentiate between the
> >two for issuing new commands. ncq_depth > 0 is fine for issuing a new
> >FPDMA request, where as non-FPDMA commands need both !ncq_depth and
> >!depth.
>
> You can definitely handle both FPDMA and non-FPDMA with a single
> variable. Think harder on this one. You have flags to work with, you
> know...

Of course it is possible, but I would rather trade 4-bytes of space to
get clearer code than switching a flag on/off all the time and counting
a single depth. But hey that's trivial detail, compared to what else is
missing. When this becomes the most important point, we can take it up
again :-)

--
Jens Axboe

2005-05-26 21:50:39

by Mark Lord

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

I also saw a good boost from NCQ on the qstor driver (full version,
not the libata subset) last year. Very good for busy servers
and RAID arrays.

Jens Axboe wrote:
+ do {
+ /*
+ * we rely on the FIFO order of the exclusive waitqueues
+ */
+ prepare_to_wait_exclusive(&ap->cmd_wait_queue, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (!ata_qc_issue_ok(ap, qc, 1)) {
+ spin_unlock_irq(&ap->host_set->lock);
+ schedule();
+ spin_lock_irq(&ap->host_set->lock);
+ }
+
+ finish_wait(&ap->cmd_wait_queue, &wait);
+
+ } while (!ata_qc_issue_ok(ap, qc, 1));

In this bit (above), is it possible for this code to ever
be invoked from a SCHED_RR or SCHED_FIFO context?

If so, it will lock out all lower-priority processes
for the duration of the polling interval.

Cheers

2005-05-27 04:42:10

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe wrote:
> @@ -189,7 +194,7 @@
> .ioctl = ata_scsi_ioctl,
> .queuecommand = ata_scsi_queuecmd,
> .eh_strategy_handler = ata_scsi_error,
> - .can_queue = ATA_DEF_QUEUE,
> + .can_queue = ATA_MAX_QUEUE,
> .this_id = ATA_SHT_THIS_ID,
> .sg_tablesize = AHCI_MAX_SG,
> .max_sectors = ATA_MAX_SECTORS,
> @@ -200,7 +205,7 @@
> .dma_boundary = AHCI_DMA_BOUNDARY,
> .slave_configure = ata_scsi_slave_config,
> .bios_param = ata_std_bios_param,
> - .ordered_flush = 1,
> + .ordered_flush = 0, /* conflicts with NCQ for now */
> };
>
> static struct ata_port_operations ahci_ops = {

Also, you'll probably want to implement ata_scsi_change_queue_depth, and
reference it in ahci.c (and other NCQ-capable drivers).

Jeff


2005-05-27 06:27:14

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Thu, May 26 2005, Mark Lord wrote:
> I also saw a good boost from NCQ on the qstor driver (full version,
> not the libata subset) last year. Very good for busy servers
> and RAID arrays.

It does seem to work amazingly well, from a performance point of view.

> Jens Axboe wrote:
> + do {
> + /*
> + * we rely on the FIFO order of the exclusive waitqueues
> + */
> + prepare_to_wait_exclusive(&ap->cmd_wait_queue, &wait,
> + TASK_UNINTERRUPTIBLE);
> +
> + if (!ata_qc_issue_ok(ap, qc, 1)) {
> + spin_unlock_irq(&ap->host_set->lock);
> + schedule();
> + spin_lock_irq(&ap->host_set->lock);
> + }
> +
> + finish_wait(&ap->cmd_wait_queue, &wait);
> +
> + } while (!ata_qc_issue_ok(ap, qc, 1));
>
> In this bit (above), is it possible for this code to ever
> be invoked from a SCHED_RR or SCHED_FIFO context?
>
> If so, it will lock out all lower-priority processes
> for the duration of the polling interval.

Yeah, I'm not a huge fan of the need for the above code... If every qc
was tied to a SCSI command, we could just ask for a later requeue of the
request as is currently done with NCQ commands. Alternatively, we could
add an internal libata qc queue for postponing these commands. That
would take a little effort, as the sync errors reported by
ata_qc_issue() would then be need to signalled async through the
completion callback instead.

Jeff, what do you think?

--
Jens Axboe

2005-05-27 06:38:57

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Fri, May 27 2005, Jeff Garzik wrote:
> Jens Axboe wrote:
> >@@ -189,7 +194,7 @@
> > .ioctl = ata_scsi_ioctl,
> > .queuecommand = ata_scsi_queuecmd,
> > .eh_strategy_handler = ata_scsi_error,
> >- .can_queue = ATA_DEF_QUEUE,
> >+ .can_queue = ATA_MAX_QUEUE,
> > .this_id = ATA_SHT_THIS_ID,
> > .sg_tablesize = AHCI_MAX_SG,
> > .max_sectors = ATA_MAX_SECTORS,
> >@@ -200,7 +205,7 @@
> > .dma_boundary = AHCI_DMA_BOUNDARY,
> > .slave_configure = ata_scsi_slave_config,
> > .bios_param = ata_std_bios_param,
> >- .ordered_flush = 1,
> >+ .ordered_flush = 0, /* conflicts with NCQ for now */
> > };
> >
> > static struct ata_port_operations ahci_ops = {
>
> Also, you'll probably want to implement ata_scsi_change_queue_depth, and
> reference it in ahci.c (and other NCQ-capable drivers).

Done!

--
Jens Axboe

2005-05-27 06:58:25

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe wrote:
> Yeah, I'm not a huge fan of the need for the above code... If every qc
> was tied to a SCSI command, we could just ask for a later requeue of the
> request as is currently done with NCQ commands. Alternatively, we could
> add an internal libata qc queue for postponing these commands. That
> would take a little effort, as the sync errors reported by
> ata_qc_issue() would then be need to signalled async through the
> completion callback instead.
>
> Jeff, what do you think?

Just use the SCSI layer for requeueing. That's what I intended.

Every qc that matters can be requeued. Just don't worry about
non-queued, non-fast-path commands. They are typically used in
functions that will immediately notice a failure, and handle it accordingly.

Jeff


2005-05-27 07:18:12

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Fri, May 27 2005, Jeff Garzik wrote:
> Jens Axboe wrote:
> >Yeah, I'm not a huge fan of the need for the above code... If every qc
> >was tied to a SCSI command, we could just ask for a later requeue of the
> >request as is currently done with NCQ commands. Alternatively, we could
> >add an internal libata qc queue for postponing these commands. That
> >would take a little effort, as the sync errors reported by
> >ata_qc_issue() would then be need to signalled async through the
> >completion callback instead.
> >
> >Jeff, what do you think?
>
> Just use the SCSI layer for requeueing. That's what I intended.

Yep, that is what I'm doing for SCSI originated commands.

> Every qc that matters can be requeued. Just don't worry about
> non-queued, non-fast-path commands. They are typically used in
> functions that will immediately notice a failure, and handle it
> accordingly.

So the current wait-around stuff is ok with you?

--
Jens Axboe

2005-05-27 07:22:41

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Thu, May 26 2005, Jeff Garzik wrote:
> >+ return 0;
> >+}
> >+
> >+/**
> > * ata_bus_probe - Reset and probe ATA bus
> > * @ap: Bus to probe
> > *
> >@@ -2753,6 +2830,16 @@
> > struct ata_port *ap = qc->ap;
> > unsigned int tag, do_clear = 0;
> >
> >+ if (likely(qc->flags & ATA_QCFLAG_ACCOUNT)) {
> >+ if (qc->flags & ATA_QCFLAG_NCQ) {
> >+ assert(ap->ncq_depth);
> >+ ap->ncq_depth--;
> >+ } else {
> >+ assert(ap->depth);
> >+ ap->depth--;
> >+ }
> >+ }
>
> why is this accounting conditional?

I double checked this. If you agree to move the setting of QCFLAG_ACTIVE
_after_ successful ap->ops->qc_issue(qc) and clear it _after_
__ata_qc_complete(qc) then I can just use that bit and kill
ATA_QCFLAG_ACCOUNT.

What do you think?

--
Jens Axboe

2005-05-27 07:32:58

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

author Albert Lee <[email protected]> Fri, 29 Apr 2005 17:34:59 +0800
committer Jeff Garzik <[email protected]> Mon, 16 May 2005 02:46:59 -0400

[PATCH] libata: Prevent the interrupt handler from completing a command twice

Problem:
During the libata CD-ROM stress test, sometimes the "BUG: timeout
without command" error is seen.

Root cause:
Unexpected interrupt occurs after the ata_qc_complete() is called,
but before the SCSI error handler. The interrupt handler is invoked
before the SCSI error handler, and it clears the command by calling
ata_qc_complete() again. Later when the SCSI error handler is run,
the ata_queued_cmd is already gone, causing the "BUG: timeout without
command" error.

Changes:
- Use the ATA_QCFLAG_ACTIVE flag to prevent the interrupt handler
from completing the command twice, before the scsi_error_handler.

Signed-off-by: Albert Lee <[email protected]>


diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -2539,7 +2539,7 @@ static void atapi_request_sense(struct a
ata_sg_init_one(qc, cmd->sense_buffer, sizeof(cmd->sense_buffer));
qc->dma_dir = DMA_FROM_DEVICE;

- memset(&qc->cdb, 0, sizeof(ap->cdb_len));
+ memset(&qc->cdb, 0, ap->cdb_len);
qc->cdb[0] = REQUEST_SENSE;
qc->cdb[4] = SCSI_SENSE_BUFFERSIZE;

@@ -2811,6 +2811,7 @@ void ata_qc_complete(struct ata_queued_c

/* call completion callback */
rc = qc->complete_fn(qc, drv_stat);
+ qc->flags &= ~ATA_QCFLAG_ACTIVE;

/* if callback indicates not to complete command (non-zero),
* return immediately
@@ -3229,7 +3230,8 @@ irqreturn_t ata_interrupt (int irq, void
struct ata_queued_cmd *qc;

qc = ata_qc_from_tag(ap, ap->active_tag);
- if (qc && (!(qc->tf.ctl & ATA_NIEN)))
+ if (qc && (!(qc->tf.ctl & ATA_NIEN)) &&
+ (qc->flags & ATA_QCFLAG_ACTIVE))
handled |= ata_host_intr(ap, qc);
}
}


Attachments:
patch (1.89 kB)

2005-05-27 07:38:42

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Fri, May 27 2005, Jeff Garzik wrote:
> Jens Axboe wrote:
> >I double checked this. If you agree to move the setting of QCFLAG_ACTIVE
> >_after_ successful ap->ops->qc_issue(qc) and clear it _after_
> >__ata_qc_complete(qc) then I can just use that bit and kill
> >ATA_QCFLAG_ACCOUNT.
> >
> >What do you think?
>
> Fine with me.
>
> Keep in mind that the attached patch was applied recently...

Yeah, the two hunks from the ncq patch would look like this then. Ok?

Index: drivers/scsi/libata-core.c
===================================================================
--- 3ac9a34948049bff79a2b2ce49c0a3c84e35a748/drivers/scsi/libata-core.c (mode:100644)
+++ uncommitted/drivers/scsi/libata-core.c (mode:100644)
@@ -2812,15 +2893,15 @@

/* call completion callback */
rc = qc->complete_fn(qc, drv_stat);
- qc->flags &= ~ATA_QCFLAG_ACTIVE;

/* if callback indicates not to complete command (non-zero),
* return immediately
*/
- if (rc != 0)
+ if (unlikely(rc != 0))
return;

__ata_qc_complete(qc);
+ qc->flags &= ~ATA_QCFLAG_ACTIVE;

VPRINTK("EXIT\n");
}
@@ -2884,12 +3026,25 @@
ap->ops->qc_prep(qc);

qc->ap->active_tag = qc->tag;
+
+ rc = ap->ops->qc_issue(qc);
+ if (rc != ATA_QC_ISSUE_OK)
+ goto err_out;
+
qc->flags |= ATA_QCFLAG_ACTIVE;

- return ap->ops->qc_issue(qc);
+ if (qc->flags & ATA_QCFLAG_NCQ) {
+ assert(ap->ncq_depth < ATA_MAX_QUEUE)
+ ap->ncq_depth++;
+ } else {
+ assert(!ap->depth);
+ ap->depth++;
+ }

+ return ATA_QC_ISSUE_OK;
err_out:
- return -1;
+ ata_qc_free(qc);
+ return rc;
}

/**


--
Jens Axboe

2005-05-27 07:54:02

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe wrote:
> On Fri, May 27 2005, Jeff Garzik wrote:
>
>>Jens Axboe wrote:
>>
>>>I double checked this. If you agree to move the setting of QCFLAG_ACTIVE
>>>_after_ successful ap->ops->qc_issue(qc) and clear it _after_
>>>__ata_qc_complete(qc) then I can just use that bit and kill
>>>ATA_QCFLAG_ACCOUNT.
>>>
>>>What do you think?
>>
>>Fine with me.
>>
>>Keep in mind that the attached patch was applied recently...
>
>
> Yeah, the two hunks from the ncq patch would look like this then. Ok?

ACK (modulo my distaste for 'depth' and 'ncq_depth', of course... :))


2005-05-27 07:59:56

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Fri, May 27 2005, Jeff Garzik wrote:
> Jens Axboe wrote:
> >On Fri, May 27 2005, Jeff Garzik wrote:
> >
> >>Jens Axboe wrote:
> >>
> >>>I double checked this. If you agree to move the setting of QCFLAG_ACTIVE
> >>>_after_ successful ap->ops->qc_issue(qc) and clear it _after_
> >>>__ata_qc_complete(qc) then I can just use that bit and kill
> >>>ATA_QCFLAG_ACCOUNT.
> >>>
> >>>What do you think?
> >>
> >>Fine with me.
> >>
> >>Keep in mind that the attached patch was applied recently...
> >
> >
> >Yeah, the two hunks from the ncq patch would look like this then. Ok?
>
> ACK (modulo my distaste for 'depth' and 'ncq_depth', of course... :))

Ok, lets get that fixed then... Would you like just a single
ap->queue_depth and a ATA_DFLAG_NCQ_IN_FLIGHT type flag instead?

--
Jens Axboe

2005-05-27 08:23:40

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe wrote:
> Ok, lets get that fixed then... Would you like just a single
> ap->queue_depth and a ATA_DFLAG_NCQ_IN_FLIGHT type flag instead?

works for me

Jeff


2005-05-27 21:40:33

by Michael Thonke

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Hello Jens,

I tried to play with your patch on ICH6R and ICH7R chipset also on
Sil3124R Controller
with 2xSamsung HD160JJ SATAII drives. But the performance gain stay out..
anything special to set to get it working? I used a vanilla-kernel
2.6.12-rc5-git2 for it.

Thanks for assistance/help

Greets
Michael

2005-05-27 22:18:48

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Michael Thonke wrote:
> Hello Jens,
>
> I tried to play with your patch on ICH6R and ICH7R chipset also on
> Sil3124R Controller
> with 2xSamsung HD160JJ SATAII drives. But the performance gain stay out..
> anything special to set to get it working? I used a vanilla-kernel
> 2.6.12-rc5-git2 for it.

SiI 3124 driver needs to be updated to support NCQ.

Jeff



2005-05-27 22:31:14

by Michael Thonke

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jeff Garzik schrieb:

> Michael Thonke wrote:
>
>> Hello Jens,
>>
>> I tried to play with your patch on ICH6R and ICH7R chipset also on
>> Sil3124R Controller
>> with 2xSamsung HD160JJ SATAII drives. But the performance gain stay
>> out..
>> anything special to set to get it working? I used a vanilla-kernel
>> 2.6.12-rc5-git2 for it.
>
>
> SiI 3124 driver needs to be updated to support NCQ.
>
> Jeff
>
>
>
>
Hello Jeff,

thanks for the info, and whats about Intel ICH6R and ICH7R anything
special there?
I played a bit with the patch and tried to tune it a bit. But there is
no Documentation
for AHCI in kernel.with parameter I can handover or can be changed.
Maybe I've missed them?
If so can you please refer me to one?

Thanks in advance and for help

Greets
Michael

2005-05-28 12:12:54

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sat, May 28 2005, Michael Thonke wrote:
> Jeff Garzik schrieb:
>
> > Michael Thonke wrote:
> >
> >> Hello Jens,
> >>
> >> I tried to play with your patch on ICH6R and ICH7R chipset also on
> >> Sil3124R Controller
> >> with 2xSamsung HD160JJ SATAII drives. But the performance gain stay
> >> out..
> >> anything special to set to get it working? I used a vanilla-kernel
> >> 2.6.12-rc5-git2 for it.
> >
> >
> > SiI 3124 driver needs to be updated to support NCQ.
> >
> > Jeff
> >
> >
> >
> >
> Hello Jeff,
>
> thanks for the info, and whats about Intel ICH6R and ICH7R anything
> special there?
> I played a bit with the patch and tried to tune it a bit. But there is
> no Documentation
> for AHCI in kernel.with parameter I can handover or can be changed.
> Maybe I've missed them?
> If so can you please refer me to one?

There's really nothing to be tuned. If NCQ is enabled for your drive, it
will be printed in dmesg after the lba48 flag, such as:

ata1: dev 0 ATA, max UDMA/133, 488281250 sectors lba48 ncq

If you don't see NCQ there, your drive/controller doesn't support it.
Likewise you will have a queueing depth of > 1 if NCQ is enabled, check
/sys/block/sdX/device/queue_depth to see what the configured queueing
depth is for that device.

--
Jens Axboe

2005-05-29 13:01:36

by Michael Thonke

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe wrote,

>
>There's really nothing to be tuned. If NCQ is enabled for your drive, it
>will be printed in dmesg after the lba48 flag, such as:
>
>ata1: dev 0 ATA, max UDMA/133, 488281250 sectors lba48 ncq
>
>If you don't see NCQ there, your drive/controller doesn't support it.
>Likewise you will have a queueing depth of > 1 if NCQ is enabled, check
>/sys/block/sdX/device/queue_depth to see what the configured queueing
>depth is for that device.
>
>
>
Hi Jens,

thanks for the short info now my next question how many queue depths
are healty and wanted?

For my Intel Corporation 82801GR/GH (ICH7 Family) Serial ATA Storage
Controllers cc=AHCI (rev 01)
and Samsung Hd160JJ SATAII drive the default queue is 30

ioGL64NX_MACH~# cat /sys/block/sda/device/{model,queue_depth}
SAMSUNG HD160JJ
30

hdparm -Tt /dev/sda

/dev/sda:
Timing cached reads: 4724 MB in 2.00 seconds = 2360.00 MB/sec
Timing buffered disk reads: 164 MB in 3.02 seconds = 54.28 MB/sec

On random access the drives is a bit noisy but the subjective feeling is
great
everything goes a bit faster.

And whats about the option /sys/block/sdx/device/queue_type = simple
what can be done here?

Thanks in advance
Best regards
Michael



2005-05-29 14:10:13

by Mark Lord

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

My basic hdparm timing test shouldn't show much of a difference
with NCQ tests, becase hdparm just does a single request at a time,
and waits for the results before issuing another. Now, kernel read-ahead
may result in some command overlap and a slight throughput increase, but..

Something like dbench and/or bonnie++ are more appropriate here.

Cheers

2005-05-29 16:04:08

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Thu, 2005-05-26 at 16:00 +0200, Jens Axboe wrote:
> Now, this patch is not complete. It should work and work well, but error
> handling isn't really tested or finished yet (by any stretch of the
> imagination). So don't use this on a production machine, it _should_ be
> safe to use on your test boxes and for-fun workstations though (I run it
> here...). I have tested on ich6 and ich7 generation ahci, and with
> Maxtor and Seagate drives.

Is this supposed to work on ICH7 in legacy mode as well?

Another question: is there a fundamental problem to have the ICH6/7
enabled AHCI mode by the kernel instead of the BIOS? I know some BIOSes
don't offer the choice to enable AHCI (like mine :-().

2005-05-29 16:34:46

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Erik Slagter wrote:
> On Thu, 2005-05-26 at 16:00 +0200, Jens Axboe wrote:
>
>>Now, this patch is not complete. It should work and work well, but error
>>handling isn't really tested or finished yet (by any stretch of the
>>imagination). So don't use this on a production machine, it _should_ be
>>safe to use on your test boxes and for-fun workstations though (I run it
>>here...). I have tested on ich6 and ich7 generation ahci, and with
>>Maxtor and Seagate drives.
>
>
> Is this supposed to work on ICH7 in legacy mode as well?

Nope. ata_piix does not support NCQ (because the h/w doesn't support).


> Another question: is there a fundamental problem to have the ICH6/7
> enabled AHCI mode by the kernel instead of the BIOS? I know some BIOSes
> don't offer the choice to enable AHCI (like mine :-().

Not a problem. You just don't get to use AHCI and such.

Jeff


2005-05-29 16:51:10

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, 2005-05-29 at 12:34 -0400, Jeff Garzik wrote:
> >>Now, this patch is not complete. It should work and work well, but error
> >>handling isn't really tested or finished yet (by any stretch of the
> >>imagination). So don't use this on a production machine, it _should_ be
> >>safe to use on your test boxes and for-fun workstations though (I run it
> >>here...). I have tested on ich6 and ich7 generation ahci, and with
> >>Maxtor and Seagate drives.
> > Is this supposed to work on ICH7 in legacy mode as well?
> Nope. ata_piix does not support NCQ (because the h/w doesn't support).

If I understand this correctly: NCQ does not work on ICH7 in native mode
(using ata_piix) because in this mode there is no NCQ available, right?

> > Another question: is there a fundamental problem to have the ICH6/7
> > enabled AHCI mode by the kernel instead of the BIOS? I know some BIOSes
> > don't offer the choice to enable AHCI (like mine :-().
> Not a problem. You just don't get to use AHCI and such.

Huh?

My question was if there is a fundamental reason why the AHCI mode of
the ICH6/7 must be enabled by the BIOS, is there a reason why the kernel
doesn't do it, or can't do it?

And of course I'd like to have my ICH6/7 in AHCI mode, for obvious
reasons.

2005-05-29 16:57:40

by Michael Thonke

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Erik Slagter schrieb:

>On Thu, 2005-05-26 at 16:00 +0200, Jens Axboe wrote:
>
>
>>Now, this patch is not complete. It should work and work well, but error
>>handling isn't really tested or finished yet (by any stretch of the
>>imagination). So don't use this on a production machine, it _should_ be
>>safe to use on your test boxes and for-fun workstations though (I run it
>>here...). I have tested on ich6 and ich7 generation ahci, and with
>>Maxtor and Seagate drives.
>>
>>
>
>Is this supposed to work on ICH7 in legacy mode as well?
>
>Another question: is there a fundamental problem to have the ICH6/7
>enabled AHCI mode by the kernel instead of the BIOS? I know some BIOSes
>don't offer the choice to enable AHCI (like mine :-().
>-
>To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>the body of a message to [email protected]
>More majordomo info at http://vger.kernel.org/majordomo-info.html
>Please read the FAQ at http://www.tux.org/lkml/
>
>
>
Hello,

here on my Mainboard ASUS P5WAD2-Premium (i955X/ICH7) I need to enable
AHCI support
like on the ICH6R based ASUS P5AD2-E xx the same here.
What I have noticed is when I enable Intel Raid support AHCI is on by
default, maybe it used AHCI if
drive is capable?! There are also performance regressions from within
used bios.
What mainboard are you using Erik? All mainboards with ICH6R I saw and
had (925X,925XE) offer options to enable AHCI.

Greets and best regards

Michael

2005-05-29 16:59:52

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Erik Slagter wrote:
> On Sun, 2005-05-29 at 12:34 -0400, Jeff Garzik wrote:
>
>>>>Now, this patch is not complete. It should work and work well, but error
>>>>handling isn't really tested or finished yet (by any stretch of the
>>>>imagination). So don't use this on a production machine, it _should_ be
>>>>safe to use on your test boxes and for-fun workstations though (I run it
>>>>here...). I have tested on ich6 and ich7 generation ahci, and with
>>>>Maxtor and Seagate drives.
>>>
>>>Is this supposed to work on ICH7 in legacy mode as well?
>>
>>Nope. ata_piix does not support NCQ (because the h/w doesn't support).
>
>
> If I understand this correctly: NCQ does not work on ICH7 in native mode
> (using ata_piix) because in this mode there is no NCQ available, right?

To be more specific, there are these modes:

legacy mode no NCQ
combined mode no NCQ
native mode no NCQ
AHCI mode NCQ


>>>Another question: is there a fundamental problem to have the ICH6/7
>>>enabled AHCI mode by the kernel instead of the BIOS? I know some BIOSes
>>>don't offer the choice to enable AHCI (like mine :-().
>>
>>Not a problem. You just don't get to use AHCI and such.
>
>
> Huh?
>
> My question was if there is a fundamental reason why the AHCI mode of
> the ICH6/7 must be enabled by the BIOS, is there a reason why the kernel
> doesn't do it, or can't do it?

The BIOS sets up PCI resources necessary to use AHCI mode.

Jeff


2005-05-29 17:24:11

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, 2005-05-29 at 12:59 -0400, Jeff Garzik wrote:

> >>Nope. ata_piix does not support NCQ (because the h/w doesn't support).
> > If I understand this correctly: NCQ does not work on ICH7 in native mode
> > (using ata_piix) because in this mode there is no NCQ available, right?
>
> To be more specific, there are these modes:
>
> legacy mode no NCQ
> combined mode no NCQ
> native mode no NCQ
> AHCI mode NCQ

Thx.

> > My question was if there is a fundamental reason why the AHCI mode of
> > the ICH6/7 must be enabled by the BIOS, is there a reason why the kernel
> > doesn't do it, or can't do it?
>
> The BIOS sets up PCI resources necessary to use AHCI mode.

Ok. So there's absolutely no way to do that afterwards? It'd really be a
pity :-(

On the same subject: is there a reason why ICH6 gets "BAR0-3 ignored"
and always gets the legacy i/o ports and IRQ's assigned? I'd say there
is absolutely no need to be compatible in this way, the PCI code can
assign the IRQ and I/O ports as with any other PCI device?

2005-05-29 17:27:14

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, 2005-05-29 at 18:57 +0200, Michael Thonke wrote:

> >Another question: is there a fundamental problem to have the ICH6/7
> >enabled AHCI mode by the kernel instead of the BIOS? I know some BIOSes
> >don't offer the choice to enable AHCI (like mine :-().

> here on my Mainboard ASUS P5WAD2-Premium (i955X/ICH7) I need to enable
> AHCI support like on the ICH6R based ASUS P5AD2-E xx the same here.
> What I have noticed is when I enable Intel Raid support AHCI is on by
> default, maybe it used AHCI if drive is capable?!

AHCI is needed for "raid"-support.

> What mainboard are you using Erik? All mainboards with ICH6R I saw and
> had (925X,925XE) offer options to enable AHCI.

ICH6M (mobile/no raid) on a Dell Inspiron 9300 laptop. AFAIK there are
no plans to implement support for AHCI transition in the BIOS. &^$##($%
DELL.

2005-05-29 17:30:15

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29, 2005 at 07:23:52PM +0200, Erik Slagter wrote:
> On Sun, 2005-05-29 at 12:59 -0400, Jeff Garzik wrote:
> > > My question was if there is a fundamental reason why the AHCI mode of
> > > the ICH6/7 must be enabled by the BIOS, is there a reason why the kernel
> > > doesn't do it, or can't do it?
> >
> > The BIOS sets up PCI resources necessary to use AHCI mode.
>
> Ok. So there's absolutely no way to do that afterwards? It'd really be a
> pity :-(

It is technically possible. BIOS is just software, just like the OS.

It's just a huge pain in the butt, because the kernel might accidentally
stomp on some resources the BIOS secretly set up, or somesuch.


> On the same subject: is there a reason why ICH6 gets "BAR0-3 ignored"
> and always gets the legacy i/o ports and IRQ's assigned? I'd say there
> is absolutely no need to be compatible in this way, the PCI code can
> assign the IRQ and I/O ports as with any other PCI device?

IDE is special.

This is due to how the BIOS sets up an IDE PCI device in legacy mode.
BAR0-3 are set to zero, which is a signal to the OS that the IDE PCI
device is in legacy mode (io 0x1f0+0x170, irq 14+15). Since the IDE I/O
ports are in ISA space not PCI space, the PCI BARs reflect nothing.

Jeff



2005-05-29 17:46:13

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, 2005-05-29 at 13:29 -0400, Jeff Garzik wrote:
> On Sun, May 29, 2005 at 07:23:52PM +0200, Erik Slagter wrote:
> > On Sun, 2005-05-29 at 12:59 -0400, Jeff Garzik wrote:
> > > > My question was if there is a fundamental reason why the AHCI mode of
> > > > the ICH6/7 must be enabled by the BIOS, is there a reason why the kernel
> > > > doesn't do it, or can't do it?
> > >
> > > The BIOS sets up PCI resources necessary to use AHCI mode.
> >
> > Ok. So there's absolutely no way to do that afterwards? It'd really be a
> > pity :-(
>
> It is technically possible. BIOS is just software, just like the OS.
>
> It's just a huge pain in the butt, because the kernel might accidentally
> stomp on some resources the BIOS secretly set up, or somesuch.

Ah, ok. I wasn't aware of this large role of the BIOS in setting up
hardware, nowadays. Grmbl.

> > On the same subject: is there a reason why ICH6 gets "BAR0-3 ignored"
> > and always gets the legacy i/o ports and IRQ's assigned? I'd say there
> > is absolutely no need to be compatible in this way, the PCI code can
> > assign the IRQ and I/O ports as with any other PCI device?
>
> IDE is special.
>
> This is due to how the BIOS sets up an IDE PCI device in legacy mode.
> BAR0-3 are set to zero, which is a signal to the OS that the IDE PCI
> device is in legacy mode (io 0x1f0+0x170, irq 14+15). Since the IDE I/O
> ports are in ISA space not PCI space, the PCI BARs reflect nothing.

Goodie. So we will be stuck with MS/DOS compatibility until somewhere in
the next millenium.

I guess the only way to have, for example the ICH6, not using legacy
IRQ/ports, is to switch it to AHCI, which only the BIOS can do (if
implemented).

Thrilling.

2005-05-29 18:01:25

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Erik Slagter wrote:
> I guess the only way to have, for example the ICH6, not using legacy
> IRQ/ports, is to switch it to AHCI, which only the BIOS can do (if
> implemented).

"native mode" is where PATA and/or SATA PCI device is programmed into
full PCI mode -- PCI BARs, PCI irq, etc. Some BIOSen allow you to
enable mode, which disables all use of legacy IRQ/ports.

Also, ICH6 silicon does not support AHCI, only ICH6-R and ICH6-M.

Jeff


2005-05-29 18:11:13

by Michael Thonke

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe schrieb:

>There's really nothing to be tuned. If NCQ is enabled for your drive, it
>will be printed in dmesg after the lba48 flag, such as:
>
>ata1: dev 0 ATA, max UDMA/133, 488281250 sectors lba48 ncq
>
>If you don't see NCQ there, your drive/controller doesn't support it.
>Likewise you will have a queueing depth of > 1 if NCQ is enabled, check
>/sys/block/sdX/device/queue_depth to see what the configured queueing
>depth is for that device.
>
>
>
Hello again,

the queue_depth of 30 is okay? On boot the CFQ scheduler tells:

cfq: depth 4 reached, tagging now on

This only appears with AHCI enabled what does that mean?

Also a question which options can be set in queue_type?

Best regard and thanks for help

Greets
Michael


2005-05-29 18:15:39

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29, 2005 at 08:10:51PM +0200, Erik Slagter wrote:
> On Sun, 2005-05-29 at 14:01 -0400, Jeff Garzik wrote:
> > Erik Slagter wrote:
> > > I guess the only way to have, for example the ICH6, not using legacy
> > > IRQ/ports, is to switch it to AHCI, which only the BIOS can do (if
> > > implemented).
> >
> > "native mode" is where PATA and/or SATA PCI device is programmed into
> > full PCI mode -- PCI BARs, PCI irq, etc. Some BIOSen allow you to
> > enable mode, which disables all use of legacy IRQ/ports.
> >
> > Also, ICH6 silicon does not support AHCI, only ICH6-R and ICH6-M.
>
> So, I do have this laptop with ICH6-M, is there a way to get it in
> "native" mode without having to enable AHCI mode (can't do that, the
> BIOS doesn't allow it)?

Same as enabling AHCI mode -- really need a BIOS option to do that.


> I still think still using IRQ14/15 etc. is
> silly ;-)

<shrug> IRQ14/15 are dedicated to IDE, whereas PCI irqs might be shared.
That can sometimes be faster, and/or lead to fewer problems.

Jeff



2005-05-29 18:14:49

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, 2005-05-29 at 14:01 -0400, Jeff Garzik wrote:
> Erik Slagter wrote:
> > I guess the only way to have, for example the ICH6, not using legacy
> > IRQ/ports, is to switch it to AHCI, which only the BIOS can do (if
> > implemented).
>
> "native mode" is where PATA and/or SATA PCI device is programmed into
> full PCI mode -- PCI BARs, PCI irq, etc. Some BIOSen allow you to
> enable mode, which disables all use of legacy IRQ/ports.
>
> Also, ICH6 silicon does not support AHCI, only ICH6-R and ICH6-M.

So, I do have this laptop with ICH6-M, is there a way to get it in
"native" mode without having to enable AHCI mode (can't do that, the
BIOS doesn't allow it)? I still think still using IRQ14/15 etc. is
silly ;-)

2005-05-29 18:27:28

by Michael Thonke

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jeff Garzik schrieb:

> Erik Slagter wrote:
>
>> I guess the only way to have, for example the ICH6, not using legacy
>> IRQ/ports, is to switch it to AHCI, which only the BIOS can do (if
>> implemented).
>
>
> "native mode" is where PATA and/or SATA PCI device is programmed into
> full PCI mode -- PCI BARs, PCI irq, etc. Some BIOSen allow you to
> enable mode, which disables all use of legacy IRQ/ports.
>
> Also, ICH6 silicon does not support AHCI, only ICH6-R and ICH6-M.
>
> Jeff
>
Hello Jeff,

AHCI also works when the so called option "RAID mode" on ICH6R/ICH7 is set.

One question in addition when will the patch of Jens in libata-2.6
tree?or is it already merged?


/Michael

2005-05-29 18:31:46

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29, 2005 at 08:27:03PM +0200, Michael Thonke wrote:
> Jeff Garzik schrieb:
>
> > Erik Slagter wrote:
> >
> >> I guess the only way to have, for example the ICH6, not using legacy
> >> IRQ/ports, is to switch it to AHCI, which only the BIOS can do (if
> >> implemented).
> >
> >
> > "native mode" is where PATA and/or SATA PCI device is programmed into
> > full PCI mode -- PCI BARs, PCI irq, etc. Some BIOSen allow you to
> > enable mode, which disables all use of legacy IRQ/ports.
> >
> > Also, ICH6 silicon does not support AHCI, only ICH6-R and ICH6-M.
> >
> > Jeff
> >
> Hello Jeff,
>
> AHCI also works when the so called option "RAID mode" on ICH6R/ICH7 is set.
>
> One question in addition when will the patch of Jens in libata-2.6
> tree?or is it already merged?

It is merged on the 'ncq' branch of libata-dev.git tree at
rsync://rsync.kernel.org/pub/scm/linux/kernel/git/jgarzik/libata-dev.git

If you need an intro to git, read http://lkml.org/lkml/2005/5/26/11

Jeff




2005-05-29 19:02:51

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29 2005, Michael Thonke wrote:
> Jens Axboe wrote,
>
> >
> >There's really nothing to be tuned. If NCQ is enabled for your drive, it
> >will be printed in dmesg after the lba48 flag, such as:
> >
> >ata1: dev 0 ATA, max UDMA/133, 488281250 sectors lba48 ncq
> >
> >If you don't see NCQ there, your drive/controller doesn't support it.
> >Likewise you will have a queueing depth of > 1 if NCQ is enabled, check
> >/sys/block/sdX/device/queue_depth to see what the configured queueing
> >depth is for that device.
> >
> >
> >
> Hi Jens,
>
> thanks for the short info now my next question how many queue depths
> are healty and wanted?
>
> For my Intel Corporation 82801GR/GH (ICH7 Family) Serial ATA Storage
> Controllers cc=AHCI (rev 01)
> and Samsung Hd160JJ SATAII drive the default queue is 30
>
> ioGL64NX_MACH~# cat /sys/block/sda/device/{model,queue_depth}
> SAMSUNG HD160JJ
> 30
>
> hdparm -Tt /dev/sda
>
> /dev/sda:
> Timing cached reads: 4724 MB in 2.00 seconds = 2360.00 MB/sec
> Timing buffered disk reads: 164 MB in 3.02 seconds = 54.28 MB/sec
>
> On random access the drives is a bit noisy but the subjective feeling
> is great everything goes a bit faster.

You should see a nice performance improvement on random reads mainly,
with streamed threaded reads being a bit faster as well. Write
performance will be the same, if you had write back caching on before.
So the real win is random reads, and that can be a pretty big win.

Actually I would say that the drive should sound _less_ noisy if NCQ is
being really effective. Hard to judge of course, very subjective :)

> And whats about the option /sys/block/sdx/device/queue_type = simple
> what can be done here?

Nothing, unfortunately NCQ doesn't provided any way of doing ordered
tags. The only tunable is the queue_depth, you can set that anywhere
between 1 and 30.

--
Jens Axboe

2005-05-29 19:04:23

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29 2005, Mark Lord wrote:
> My basic hdparm timing test shouldn't show much of a difference
> with NCQ tests, becase hdparm just does a single request at a time,
> and waits for the results before issuing another. Now, kernel read-ahead
> may result in some command overlap and a slight throughput increase, but..
>
> Something like dbench and/or bonnie++ are more appropriate here.

I don't like bonnie++ very much and dbench is very write intensive. I
would suggest trying tiotest, find it on sf.net. It gets easily readable
results and they are usually fairly consistent across runs if you limit
the RAM to something sensible (eg 256MB and using a data set size of
768MB).

--
Jens Axboe

2005-05-29 19:05:48

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29, 2005 at 09:04:21PM +0200, Jens Axboe wrote:
> On Sun, May 29 2005, Mark Lord wrote:
> > My basic hdparm timing test shouldn't show much of a difference
> > with NCQ tests, becase hdparm just does a single request at a time,
> > and waits for the results before issuing another. Now, kernel read-ahead
> > may result in some command overlap and a slight throughput increase, but..
> >
> > Something like dbench and/or bonnie++ are more appropriate here.
>
> I don't like bonnie++ very much and dbench is very write intensive. I
> would suggest trying tiotest, find it on sf.net. It gets easily readable
> results and they are usually fairly consistent across runs if you limit
> the RAM to something sensible (eg 256MB and using a data set size of
> 768MB).

As an FYI... download Stephen Tweedie's verify-data tool at
http://people.redhat.com/sct/src/verify-data/

Robin Miller's 'dt' is also nice to have.

Jeff



2005-05-29 19:08:06

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29 2005, Michael Thonke wrote:
> Jens Axboe schrieb:
>
> >There's really nothing to be tuned. If NCQ is enabled for your drive, it
> >will be printed in dmesg after the lba48 flag, such as:
> >
> >ata1: dev 0 ATA, max UDMA/133, 488281250 sectors lba48 ncq
> >
> >If you don't see NCQ there, your drive/controller doesn't support it.
> >Likewise you will have a queueing depth of > 1 if NCQ is enabled, check
> >/sys/block/sdX/device/queue_depth to see what the configured queueing
> >depth is for that device.
> >
> >
> >
> Hello again,
>
> the queue_depth of 30 is okay? On boot the CFQ scheduler tells:

By default, the maximum depth is used. For desktop use, a depth of 2-4
is probably more appropriate until the io schedulers become a little
more intelligent wrt queueing.

> cfq: depth 4 reached, tagging now on
>
> This only appears with AHCI enabled what does that mean?

It only appears if AHCI is enabled, because tagged command queueing is
only working/enabled on AHCI. The message is informational only, CFQ
tells you that it has detected queueing hardware (driver maintains a
depth of >= 4).

> Also a question which options can be set in queue_type?

Nothing.

--
Jens Axboe

2005-05-29 19:22:02

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29 2005, Jeff Garzik wrote:
> On Sun, May 29, 2005 at 09:04:21PM +0200, Jens Axboe wrote:
> > On Sun, May 29 2005, Mark Lord wrote:
> > > My basic hdparm timing test shouldn't show much of a difference
> > > with NCQ tests, becase hdparm just does a single request at a time,
> > > and waits for the results before issuing another. Now, kernel read-ahead
> > > may result in some command overlap and a slight throughput increase, but..
> > >
> > > Something like dbench and/or bonnie++ are more appropriate here.
> >
> > I don't like bonnie++ very much and dbench is very write intensive. I
> > would suggest trying tiotest, find it on sf.net. It gets easily readable
> > results and they are usually fairly consistent across runs if you limit
> > the RAM to something sensible (eg 256MB and using a data set size of
> > 768MB).
>
> As an FYI... download Stephen Tweedie's verify-data tool at
> http://people.redhat.com/sct/src/verify-data/

Interesting, will try it tomorrow.

> Robin Miller's 'dt' is also nice to have.

Yep, have tried that in the past. I'm just recommending tiotest as an
easy and good way for people to test performance quickly. Just boot with
256MB and use eg tiobench.pl --threads 8 should be a good way to test
NCQ.

--
Jens Axboe

2005-05-29 20:12:56

by Michael Thonke

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jens Axboe schrieb:

>On Sun, May 29 2005, Michael Thonke wrote:
>
>
>>Jens Axboe wrote,
>>
>>
>>
>>>There's really nothing to be tuned. If NCQ is enabled for your drive, it
>>>will be printed in dmesg after the lba48 flag, such as:
>>>
>>>ata1: dev 0 ATA, max UDMA/133, 488281250 sectors lba48 ncq
>>>
>>>If you don't see NCQ there, your drive/controller doesn't support it.
>>>Likewise you will have a queueing depth of > 1 if NCQ is enabled, check
>>>/sys/block/sdX/device/queue_depth to see what the configured queueing
>>>depth is for that device.
>>>
>>>
>>>
>>>
>>>
>>Hi Jens,
>>
>>thanks for the short info now my next question how many queue depths
>>are healty and wanted?
>>
>>For my Intel Corporation 82801GR/GH (ICH7 Family) Serial ATA Storage
>>Controllers cc=AHCI (rev 01)
>>and Samsung Hd160JJ SATAII drive the default queue is 30
>>
>> ioGL64NX_MACH~# cat /sys/block/sda/device/{model,queue_depth}
>> SAMSUNG HD160JJ
>> 30
>>
>> hdparm -Tt /dev/sda
>>
>> /dev/sda:
>> Timing cached reads: 4724 MB in 2.00 seconds = 2360.00 MB/sec
>> Timing buffered disk reads: 164 MB in 3.02 seconds = 54.28 MB/sec
>>
>>On random access the drives is a bit noisy but the subjective feeling
>>is great everything goes a bit faster.
>>
>>
>
>You should see a nice performance improvement on random reads mainly,
>with streamed threaded reads being a bit faster as well. Write
>performance will be the same, if you had write back caching on before.
>So the real win is random reads, and that can be a pretty big win.
>
>Actually I would say that the drive should sound _less_ noisy if NCQ is
>being really effective. Hard to judge of course, very subjective :)
>
>
>
Well the subjective feeling is great through! What I noticed is a
improvement on rsync (goes slighty faster drive to drive).
The noise decrease now it only make noise on very heavy IO reads and writes.

>>And whats about the option /sys/block/sdx/device/queue_type = simple
>>what can be done here?
>>
>>
>
>Nothing, unfortunately NCQ doesn't provided any way of doing ordered
>tags. The only tunable is the queue_depth, you can set that anywhere
>between 1 and 30.
>
>
>
So way my drive got default 30 as queue_depth on AHCI as you mentoined
in the next mail
2-4 should be suitable and enough/normal?


Thanks for the informations

Greets
Michael

2005-05-29 20:18:00

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Michael Thonke wrote:
> Well the subjective feeling is great through! What I noticed is a
> improvement on rsync (goes slighty faster drive to drive).
> The noise decrease now it only make noise on very heavy IO reads and writes.


It might be as simple as... NCQ means the drive is doing more work.
More work means more noise.

Jeff


2005-05-29 21:50:10

by Michael Thonke

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Hello,

sometimes when I boot the kernel is unable to get the correct geometrie
of my Samsung drives.

Can only provide this message the kernel prints:

getblk();invalid blocksize 166869779 hardsect: 512

That's everthing I've got.

When it boot correctly I get this for the drives:

SCSI device sda: 312581808 512-byte hdwr sectors (160042 MB)
SCSI device sda: drive cache: write back
SCSI device sda: 312581808 512-byte hdwr sectors (160042 MB)
SCSI device sda: drive cache: write back
sda: sda1 sda2 < sda5 >
Attached scsi disk sda at scsi0, channel 0, id 0, lun 0
SCSI device sdb: 312581808 512-byte hdwr sectors (160042 MB)
Losing some ticks... checking if CPU frequency changed.
SCSI device sdb: drive cache: write back
SCSI device sdb: 312581808 512-byte hdwr sectors (160042 MB)
SCSI device sdb: drive cache: write back
sdb: sdb1 sdb2 < sdb5 sdb6 >

If you need further infos let me now.

Greets
Michael

2005-05-30 00:06:19

by Mark Lord

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Erik Slagter wrote:
>
> ICH6M (mobile/no raid) on a Dell Inspiron 9300 laptop. AFAIK there are
> no plans to implement support for AHCI transition in the BIOS. &^$##($%
> DELL.

No hope of it on this machine (I'm using a tricked-out i9300 here too),
because (1) the HD is PATA, not SATA, and (2) the drive itself probably
doesn't support NCQ (my 100GB drive does NOT -- use "hdparm -I" to see
what is supported on any given drive. libata-dev includes hdparm support).

Cheers

2005-05-30 06:05:11

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29 2005, Jeff Garzik wrote:
> Michael Thonke wrote:
> >Well the subjective feeling is great through! What I noticed is a
> >improvement on rsync (goes slighty faster drive to drive).
> >The noise decrease now it only make noise on very heavy IO reads and
> >writes.
>
>
> It might be as simple as... NCQ means the drive is doing more work.
> More work means more noise.

You can argue both ways :-). Usually more noise means more seeks and
thus less work done.

--
Jens Axboe

2005-05-30 06:13:19

by Jens Axboe

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, May 29 2005, Michael Thonke wrote:
> >Nothing, unfortunately NCQ doesn't provided any way of doing ordered
> >tags. The only tunable is the queue_depth, you can set that anywhere
> >between 1 and 30.
> >
> >
> >
> So way my drive got default 30 as queue_depth on AHCI as you mentoined
> in the next mail
> 2-4 should be suitable and enough/normal?

Yes, 30 will be default for basically anyone as ahci now advertises a
max depth of 30 and the drives typically all support a depth of 32. The
minimum value of those two is used.

Whether 2-4 is 'enough' depends on your workload. But usually 2-4 is
enough to get a nice speedup on heavier io, while still getting you good
latencies on the individual io.

--
Jens Axboe

2005-05-30 07:30:10

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Sun, 2005-05-29 at 20:06 -0400, Mark Lord wrote:
> > ICH6M (mobile/no raid) on a Dell Inspiron 9300 laptop. AFAIK there are
> > no plans to implement support for AHCI transition in the BIOS. &^$##($%
> > DELL.
>
> No hope of it on this machine (I'm using a tricked-out i9300 here too),
> because (1) the HD is PATA, not SATA, and (2) the drive itself probably
> doesn't support NCQ (my 100GB drive does NOT -- use "hdparm -I" to see
> what is supported on any given drive. libata-dev includes hdparm support).

I really have a (native) SATA drive, I checked the ID from dmesg.

2005-05-30 18:10:38

by Mark Lord

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Erik Slagter wrote:
> On Sun, 2005-05-29 at 20:06 -0400, Mark Lord wrote:
>
>>>ICH6M (mobile/no raid) on a Dell Inspiron 9300 laptop. AFAIK there are
>>>no plans to implement support for AHCI transition in the BIOS. &^$##($%
>>>DELL.
> I really have a (native) SATA drive, I checked the ID from dmesg.

Seems rather unlikely that they would plumb the same notebook both ways.
The 100GB drive in the i9300 here is a "FUJITSU MHV2100AH" (PATA).

Cheers

2005-05-30 18:24:53

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Mon, 2005-05-30 at 14:09 -0400, Mark Lord wrote:
> >>>ICH6M (mobile/no raid) on a Dell Inspiron 9300 laptop. AFAIK there are
> >>>no plans to implement support for AHCI transition in the BIOS. &^$##($%
> >>>DELL.
> > I really have a (native) SATA drive, I checked the ID from dmesg.
>
> Seems rather unlikely that they would plumb the same notebook both ways.
> The 100GB drive in the i9300 here is a "FUJITSU MHV2100AH" (PATA).

This one is a MHT2080A and it looks indeed it's not SATA, so no NCQ.
Still I'd like to run in ACHI mode ;-)

I must have been fooled by the FC3 setup disk that handed it libata, I
didn't know libata also handles pata, then.

2005-05-30 18:32:07

by Mark Lord

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Erik Slagter wrote:>
> Still I'd like to run in ACHI mode ;-)

Me too! But from reading the ICH6 Intel docs,
it seems that AHCI mode is only for true SATA drives.

Or perhaps I've mis-read that part.

Cheers

2005-05-30 18:37:20

by Michael Thonke

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Mark Lord schrieb:

> Erik Slagter wrote:>
>
>> Still I'd like to run in ACHI mode ;-)
>
>
> Me too! But from reading the ICH6 Intel docs,
> it seems that AHCI mode is only for true SATA drives.
>
> Or perhaps I've mis-read that part.
>
> Cheers
>
Well your're right NCQ i only a feature of SATA drives (native ones) not
like the SATA drives
that uses Marvell Brigde chip (orig. PATA) like Samsung SP80 SPxxxx series.
The new ones are real SATA drives. But some PATA devices and controllers
support TCQ.
There're surviral docs about this..also Jeff Garzik mentoined it on his
libata doc site.


Greets and best regards

Michael

2005-05-30 18:48:42

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Mark Lord wrote:
> Erik Slagter wrote:>
>
>> Still I'd like to run in ACHI mode ;-)
>
>
> Me too! But from reading the ICH6 Intel docs,
> it seems that AHCI mode is only for true SATA drives.

Correct. AHCI mode absolutely requires SATA, because it only supports
the native SATA "FIS" packets.

As much as I would like, one cannot use AHCI to talk to any PATA device[1].

Jeff



[1] unless it's PATA bridged over SATA, the exception case that turns
any PATA device into a SATA device.

2005-05-30 18:52:37

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Michael Thonke wrote:
> The new ones are real SATA drives. But some PATA devices and controllers
> support TCQ.

TCQ is a poorly designed feature, and so, will only be supported on a
very few, special host controllers.

For most people, even if they down a drive with legacy TCQ, queueing
will -never- be supported. For more details read
http://linux.yyz.us/sata/software-status.html#tcq

Jeff


2005-05-30 18:54:42

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Erik Slagter wrote:
> I must have been fooled by the FC3 setup disk that handed it libata, I
> didn't know libata also handles pata, then.

libata software supports PATA, but no distribution ships with libata
PATA support enabled (nor should they!).

There are a few unusual cases with combined mode where libata will
support PATA, but those are rare.

Jeff


2005-05-30 20:04:09

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Mon, 2005-05-30 at 14:50 -0400, Jeff Garzik wrote:
> Erik Slagter wrote:
> > I must have been fooled by the FC3 setup disk that handed it libata, I
> > didn't know libata also handles pata, then.
>
> libata software supports PATA, but no distribution ships with libata
> PATA support enabled (nor should they!).

In that case FC3 is not a distribution?!

> There are a few unusual cases with combined mode where libata will
> support PATA, but those are rare.

ich6/piix :-)

2005-05-30 20:19:51

by Jeff Garzik

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Erik Slagter wrote:
> On Mon, 2005-05-30 at 14:50 -0400, Jeff Garzik wrote:
>
>>Erik Slagter wrote:
>>
>>>I must have been fooled by the FC3 setup disk that handed it libata, I
>>>didn't know libata also handles pata, then.
>>
>>libata software supports PATA, but no distribution ships with libata
>>PATA support enabled (nor should they!).
>
>
> In that case FC3 is not a distribution?!
>
>
>>There are a few unusual cases with combined mode where libata will
>>support PATA, but those are rare.
>
>
> ich6/piix :-)

Under PATA your devices should be showing up at /dev/hdX.

If this is not the case, please report a bug.

Jeff



2005-05-30 23:18:49

by Mark Lord

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

Jeff Garzik wrote:
> libata software supports PATA, but no distribution ships with libata
> PATA support enabled (nor should they!).

(K)Ubuntu does, and it works very well, thanks.

Cheers

2005-05-31 07:45:12

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Mon, 2005-05-30 at 16:19 -0400, Jeff Garzik wrote:
> >>>I must have been fooled by the FC3 setup disk that handed it libata, I
> >>>didn't know libata also handles pata, then.
> >>libata software supports PATA, but no distribution ships with libata
> >>PATA support enabled (nor should they!).
> > In that case FC3 is not a distribution?!
> >>There are a few unusual cases with combined mode where libata will
> >>support PATA, but those are rare.
> > ich6/piix :-)
>
> Under PATA your devices should be showing up at /dev/hdX.
> If this is not the case, please report a bug.

Okay, just to be sure before I file an official bug report:

kernel 2.6.12-rc5-git4
libata enabled (with atapi enabled, but that doesn't influence the
"harddisk" behaviour)
all ide stuff disabled
ICH6-M
Fujitsu MHT2080A harddisk (apparently PATA)
Disk shows up as /dev/sda

I don't mind, though.

2005-05-31 07:49:08

by Erik Slagter

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Mon, 2005-05-30 at 19:14 -0400, Mark Lord wrote:
> > libata software supports PATA, but no distribution ships with libata
> > PATA support enabled (nor should they!).
>
> (K)Ubuntu does, and it works very well, thanks.

So that's two... Does it also show your (pata) harddisk as /dev/sda?

2005-05-31 08:08:45

by Patrick McFarland

[permalink] [raw]
Subject: Re: Playing with SATA NCQ

On Tuesday 31 May 2005 03:48 am, Erik Slagter wrote:
> On Mon, 2005-05-30 at 19:14 -0400, Mark Lord wrote:
> > > libata software supports PATA, but no distribution ships with libata
> > > PATA support enabled (nor should they!).
> >
> > (K)Ubuntu does, and it works very well, thanks.
>
> So that's two... Does it also show your (pata) harddisk as /dev/sda?

Ubuntu and Kubuntu are both the same distro, thankyouverymuch. I'm running
Ubuntu, and it does _not_ show pata drives as /dev/s*, only has /dev/h*.

--
Patrick "Diablo-D3" McFarland || [email protected]
"Computer games don't affect kids; I mean if Pac-Man affected us as kids, we'd
all be running around in darkened rooms, munching magic pills and listening to
repetitive electronic music." -- Kristian Wilson, Nintendo, Inc, 1989


Attachments:
(No filename) (799.00 B)
(No filename) (189.00 B)
Download all attachments