I finally got ATAPI working on my x86-64 AHCI box.
Just checked the following changes into the 'upstream-fixes' branch of
rsync://rsync.kernel.org/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
Will send upstream soon.
drivers/scsi/ahci.c | 24 +++++++++++---
drivers/scsi/libata-core.c | 64 ++++++++++++++++---------------------
drivers/scsi/libata-scsi.c | 77 +++++++++++++++++++++++++--------------------
drivers/scsi/libata.h | 2 -
include/linux/libata.h | 14 ++++++++
5 files changed, 105 insertions(+), 76 deletions(-)
Jeff Garzik:
[libata ahci] error handling fixes
[libata] fix bugs in ATAPI padding DMA mapping code
[libata] minor fixes, new helpers
[libata] REQUEST SENSE handling fixes
diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index 4e96ec5..c710a7d 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -603,7 +603,12 @@ static void ahci_intr_error(struct ata_p
writel(tmp, port_mmio + PORT_CMD);
readl(port_mmio + PORT_CMD); /* flush */
- printk(KERN_WARNING "ata%u: error occurred, port reset\n", ap->id);
+ printk(KERN_WARNING "ata%u: error occurred, port reset (%s%s%s%s)\n",
+ ap->id,
+ irq_stat & PORT_IRQ_TF_ERR ? "taskf " : "",
+ irq_stat & PORT_IRQ_HBUS_ERR ? "hbus " : "",
+ irq_stat & PORT_IRQ_HBUS_DATA_ERR ? "hbus_data " : "",
+ irq_stat & PORT_IRQ_IF_ERR ? "if " : "");
}
static void ahci_eng_timeout(struct ata_port *ap)
@@ -618,13 +623,13 @@ static void ahci_eng_timeout(struct ata_
spin_lock_irqsave(&host_set->lock, flags);
- ahci_intr_error(ap, readl(port_mmio + PORT_IRQ_STAT));
-
qc = ata_qc_from_tag(ap, ap->active_tag);
if (!qc) {
printk(KERN_ERR "ata%u: BUG: timeout without command\n",
ap->id);
} else {
+ ahci_intr_error(ap, readl(port_mmio + PORT_IRQ_STAT));
+
/* hack alert! We cannot use the supplied completion
* function from inside the ->eh_strategy_handler() thread.
* libata is the only user of ->eh_strategy_handler() in
@@ -659,9 +664,18 @@ static inline int ahci_host_intr(struct
}
if (status & PORT_IRQ_FATAL) {
- ahci_intr_error(ap, status);
+ unsigned int err_mask;
+ if (status & PORT_IRQ_TF_ERR)
+ err_mask = AC_ERR_DEV;
+ else if (status & PORT_IRQ_IF_ERR)
+ err_mask = AC_ERR_ATA_BUS;
+ else
+ err_mask = AC_ERR_HOST_BUS;
+
+ if (err_mask != AC_ERR_DEV)
+ ahci_intr_error(ap, status);
if (qc)
- ata_qc_complete(qc, AC_ERR_OTHER);
+ ata_qc_complete(qc, err_mask);
}
return 1;
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index d81db3a..ba1eb8b 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -1263,7 +1263,7 @@ retry:
}
/* ATAPI-specific feature tests */
- else {
+ else if (dev->class == ATA_DEV_ATAPI) {
if (ata_id_is_ata(dev->id)) /* sanity check */
goto err_out_nosup;
@@ -2399,7 +2399,7 @@ static void ata_sg_clean(struct ata_queu
if (qc->flags & ATA_QCFLAG_SINGLE)
assert(qc->n_elem == 1);
- DPRINTK("unmapping %u sg elements\n", qc->n_elem);
+ VPRINTK("unmapping %u sg elements\n", qc->n_elem);
/* if we padded the buffer out to 32-bit bound, and data
* xfer direction is from-device, we must copy from the
@@ -2409,7 +2409,8 @@ static void ata_sg_clean(struct ata_queu
pad_buf = ap->pad + (qc->tag * ATA_DMA_PAD_SZ);
if (qc->flags & ATA_QCFLAG_SG) {
- dma_unmap_sg(ap->host_set->dev, sg, qc->n_elem, dir);
+ if (qc->n_elem)
+ dma_unmap_sg(ap->host_set->dev, sg, qc->n_elem, dir);
/* restore last sg */
sg[qc->orig_n_elem - 1].length += qc->pad_len;
if (pad_buf) {
@@ -2419,8 +2420,10 @@ static void ata_sg_clean(struct ata_queu
kunmap_atomic(psg->page, KM_IRQ0);
}
} else {
- dma_unmap_single(ap->host_set->dev, sg_dma_address(&sg[0]),
- sg_dma_len(&sg[0]), dir);
+ if (sg_dma_len(&sg[0]) > 0)
+ dma_unmap_single(ap->host_set->dev,
+ sg_dma_address(&sg[0]), sg_dma_len(&sg[0]),
+ dir);
/* restore sg */
sg->length += qc->pad_len;
if (pad_buf)
@@ -2619,6 +2622,11 @@ static int ata_sg_setup_one(struct ata_q
sg->length, qc->pad_len);
}
+ if (!sg->length) {
+ sg_dma_address(sg) = 0;
+ goto skip_map;
+ }
+
dma_address = dma_map_single(ap->host_set->dev, qc->buf_virt,
sg->length, dir);
if (dma_mapping_error(dma_address)) {
@@ -2628,6 +2636,7 @@ static int ata_sg_setup_one(struct ata_q
}
sg_dma_address(sg) = dma_address;
+skip_map:
sg_dma_len(sg) = sg->length;
DPRINTK("mapped buffer of %d bytes for %s\n", sg_dma_len(sg),
@@ -2655,7 +2664,7 @@ static int ata_sg_setup(struct ata_queue
struct ata_port *ap = qc->ap;
struct scatterlist *sg = qc->__sg;
struct scatterlist *lsg = &sg[qc->n_elem - 1];
- int n_elem, dir;
+ int n_elem, pre_n_elem, dir, trim_sg = 0;
VPRINTK("ENTER, ata%u\n", ap->id);
assert(qc->flags & ATA_QCFLAG_SG);
@@ -2689,13 +2698,24 @@ static int ata_sg_setup(struct ata_queue
sg_dma_len(psg) = ATA_DMA_PAD_SZ;
/* trim last sg */
lsg->length -= qc->pad_len;
+ if (lsg->length == 0)
+ trim_sg = 1;
DPRINTK("padding done, sg[%d].length=%u pad_len=%u\n",
qc->n_elem - 1, lsg->length, qc->pad_len);
}
+ pre_n_elem = qc->n_elem;
+ if (trim_sg && pre_n_elem)
+ pre_n_elem--;
+
+ if (!pre_n_elem) {
+ n_elem = 0;
+ goto skip_map;
+ }
+
dir = qc->dma_dir;
- n_elem = dma_map_sg(ap->host_set->dev, sg, qc->n_elem, dir);
+ n_elem = dma_map_sg(ap->host_set->dev, sg, pre_n_elem, dir);
if (n_elem < 1) {
/* restore last sg */
lsg->length += qc->pad_len;
@@ -2704,6 +2724,7 @@ static int ata_sg_setup(struct ata_queue
DPRINTK("%d sg elements mapped\n", n_elem);
+skip_map:
qc->n_elem = n_elem;
return 0;
@@ -3263,32 +3284,11 @@ static void ata_qc_timeout(struct ata_qu
{
struct ata_port *ap = qc->ap;
struct ata_host_set *host_set = ap->host_set;
- struct ata_device *dev = qc->dev;
u8 host_stat = 0, drv_stat;
unsigned long flags;
DPRINTK("ENTER\n");
- /* FIXME: doesn't this conflict with timeout handling? */
- if (qc->dev->class == ATA_DEV_ATAPI && qc->scsicmd) {
- struct scsi_cmnd *cmd = qc->scsicmd;
-
- if (!(cmd->eh_eflags & SCSI_EH_CANCEL_CMD)) {
-
- /* finish completing original command */
- spin_lock_irqsave(&host_set->lock, flags);
- __ata_qc_complete(qc);
- spin_unlock_irqrestore(&host_set->lock, flags);
-
- atapi_request_sense(ap, dev, cmd);
-
- cmd->result = (CHECK_CONDITION << 1) | (DID_OK << 16);
- scsi_finish_command(cmd);
-
- goto out;
- }
- }
-
spin_lock_irqsave(&host_set->lock, flags);
/* hack alert! We cannot use the supplied completion
@@ -3327,7 +3327,6 @@ static void ata_qc_timeout(struct ata_qu
spin_unlock_irqrestore(&host_set->lock, flags);
-out:
DPRINTK("EXIT\n");
}
@@ -3411,16 +3410,11 @@ struct ata_queued_cmd *ata_qc_new_init(s
qc = ata_qc_new(ap);
if (qc) {
- qc->__sg = NULL;
- qc->flags = 0;
qc->scsicmd = NULL;
qc->ap = ap;
qc->dev = dev;
- qc->cursect = qc->cursg = qc->cursg_ofs = 0;
- qc->nsect = 0;
- qc->nbytes = qc->curbytes = 0;
- ata_tf_init(ap, &qc->tf, dev->devno);
+ ata_qc_reinit(qc);
}
return qc;
diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c
index 0df4b68..3b4ca55 100644
--- a/drivers/scsi/libata-scsi.c
+++ b/drivers/scsi/libata-scsi.c
@@ -1955,22 +1955,44 @@ void ata_scsi_badcmd(struct scsi_cmnd *c
done(cmd);
}
-void atapi_request_sense(struct ata_port *ap, struct ata_device *dev,
- struct scsi_cmnd *cmd)
+static int atapi_sense_complete(struct ata_queued_cmd *qc,unsigned int err_mask)
{
- DECLARE_COMPLETION(wait);
- struct ata_queued_cmd *qc;
- unsigned long flags;
- int rc;
+ if (err_mask && ((err_mask & AC_ERR_DEV) == 0))
+ /* FIXME: not quite right; we don't want the
+ * translation of taskfile registers into
+ * a sense descriptors, since that's only
+ * correct for ATA, not ATAPI
+ */
+ ata_gen_ata_desc_sense(qc);
- DPRINTK("ATAPI request sense\n");
+ qc->scsidone(qc->scsicmd);
+ return 0;
+}
+
+/* is it pointless to prefer PIO for "safety reasons"? */
+static inline int ata_pio_use_silly(struct ata_port *ap)
+{
+ return (ap->flags & ATA_FLAG_PIO_DMA);
+}
- qc = ata_qc_new_init(ap, dev);
- BUG_ON(qc == NULL);
+static void atapi_request_sense(struct ata_queued_cmd *qc)
+{
+ struct ata_port *ap = qc->ap;
+ struct scsi_cmnd *cmd = qc->scsicmd;
+
+ DPRINTK("ATAPI request sense\n");
/* FIXME: is this needed? */
memset(cmd->sense_buffer, 0, sizeof(cmd->sense_buffer));
+ ap->ops->tf_read(ap, &qc->tf);
+
+ /* fill these in, for the case where they are -not- overwritten */
+ cmd->sense_buffer[0] = 0x70;
+ cmd->sense_buffer[2] = qc->tf.feature >> 4;
+
+ ata_qc_reinit(qc);
+
ata_sg_init_one(qc, cmd->sense_buffer, sizeof(cmd->sense_buffer));
qc->dma_dir = DMA_FROM_DEVICE;
@@ -1981,22 +2003,20 @@ void atapi_request_sense(struct ata_port
qc->tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
qc->tf.command = ATA_CMD_PACKET;
- qc->tf.protocol = ATA_PROT_ATAPI;
- qc->tf.lbam = (8 * 1024) & 0xff;
- qc->tf.lbah = (8 * 1024) >> 8;
+ if (ata_pio_use_silly(ap)) {
+ qc->tf.protocol = ATA_PROT_ATAPI_DMA;
+ qc->tf.feature |= ATAPI_PKT_DMA;
+ } else {
+ qc->tf.protocol = ATA_PROT_ATAPI;
+ qc->tf.lbam = (8 * 1024) & 0xff;
+ qc->tf.lbah = (8 * 1024) >> 8;
+ }
qc->nbytes = SCSI_SENSE_BUFFERSIZE;
- qc->waiting = &wait;
- qc->complete_fn = ata_qc_complete_noop;
-
- spin_lock_irqsave(&ap->host_set->lock, flags);
- rc = ata_qc_issue(qc);
- spin_unlock_irqrestore(&ap->host_set->lock, flags);
+ qc->complete_fn = atapi_sense_complete;
- if (rc)
- ata_port_disable(ap);
- else
- wait_for_completion(&wait);
+ if (ata_qc_issue(qc))
+ ata_qc_complete(qc, AC_ERR_OTHER);
DPRINTK("EXIT\n");
}
@@ -2008,19 +2028,8 @@ static int atapi_qc_complete(struct ata_
VPRINTK("ENTER, err_mask 0x%X\n", err_mask);
if (unlikely(err_mask & AC_ERR_DEV)) {
- DPRINTK("request check condition\n");
-
- /* FIXME: command completion with check condition
- * but no sense causes the error handler to run,
- * which then issues REQUEST SENSE, fills in the sense
- * buffer, and completes the command (for the second
- * time). We need to issue REQUEST SENSE some other
- * way, to avoid completing the command twice.
- */
cmd->result = SAM_STAT_CHECK_CONDITION;
-
- qc->scsidone(cmd);
-
+ atapi_request_sense(qc);
return 1;
}
diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h
index fad051c..74a84e0 100644
--- a/drivers/scsi/libata.h
+++ b/drivers/scsi/libata.h
@@ -54,8 +54,6 @@ extern int ata_cmd_ioctl(struct scsi_dev
/* libata-scsi.c */
-extern void atapi_request_sense(struct ata_port *ap, struct ata_device *dev,
- struct scsi_cmnd *cmd);
extern void ata_scsi_scan_host(struct ata_port *ap);
extern int ata_scsi_error(struct Scsi_Host *host);
extern unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index ad59961..f2dbb68 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -59,6 +59,8 @@
#define VPRINTK(fmt, args...)
#endif /* ATA_DEBUG */
+#define BPRINTK(fmt, args...) if (ap->flags & ATA_FLAG_DEBUGMSG) printk(KERN_ERR "%s: " fmt, __FUNCTION__, ## args)
+
#ifdef ATA_NDEBUG
#define assert(expr)
#else
@@ -119,6 +121,7 @@ enum {
ATA_FLAG_PIO_DMA = (1 << 8), /* PIO cmds via DMA */
ATA_FLAG_NOINTR = (1 << 9), /* FIXME: Remove this once
* proper HSM is in place. */
+ ATA_FLAG_DEBUGMSG = (1 << 10),
ATA_QCFLAG_ACTIVE = (1 << 1), /* cmd not yet ack'd to scsi lyer */
ATA_QCFLAG_SG = (1 << 3), /* have s/g table? */
@@ -659,6 +662,17 @@ static inline void ata_tf_init(struct at
tf->device = ATA_DEVICE_OBS | ATA_DEV1;
}
+static inline void ata_qc_reinit(struct ata_queued_cmd *qc)
+{
+ qc->__sg = NULL;
+ qc->flags = 0;
+ qc->cursect = qc->cursg = qc->cursg_ofs = 0;
+ qc->nsect = 0;
+ qc->nbytes = qc->curbytes = 0;
+
+ ata_tf_init(qc->ap, &qc->tf, qc->dev->devno);
+}
+
/**
* ata_irq_on - Enable interrupts on a port.
On Mon, Nov 14, 2005 at 02:57:17PM -0500, Jeff Garzik wrote:
>
> I finally got ATAPI working on my x86-64 AHCI box.
>
> Just checked the following changes into the 'upstream-fixes' branch of
> rsync://rsync.kernel.org/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
>
Hi, Jeff.
I've been trying to do about the same thing and here's my version
which only fixes error/sense handling. This patch makes ahci's
eng_timeout act correctly w.r.t ATAPI sense requesting. libata ATAPI
request sensing mechanism was not broken. What's broken was AHCI's
eng_timeout handler. And, yes, this problem disappears with request
sensing via active qc turn-around in your patch.
As I've written several times, I'm not a big fan of sense requesting
by turning around active qc. For libata's EH to work any better,
handing-over failed commands to EH is necessary with or without
request sensing. Hmm... It's true that the current implementation
used for ATAPI request sensing needs improvements. Remember those
patches which implement generic failed command handover and perform
request sensing from EH with proper timeout and synchronization?
Ah.. also, I'm working on sil24 ATAPI support. However, it seems that
I need to do a little bit more; unfortunately, I'm not sure which....
INQUIRY succeeds and then my drive fails the first TUR with SK 6h
(UNIT ATTENTION) which is probably due to previous phy reset. Then,
my drive fails to repsond to following REQUEST SENSE.
With my patched AHCI, REQUEST SENSE works and after SK 06h and several
SK 02h's (NOT READY), it comes online and works okay.
As INQUIRY works okay, my hunch is that I'm not performing TUR
properly (that is ATAPI command with no data) and the drive locks up
after it. I'll fool around with this more and report.
Thanks.
diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c
index 4e96ec5..2d13699 100644
--- a/drivers/scsi/ahci.c
+++ b/drivers/scsi/ahci.c
@@ -611,16 +611,18 @@ static void ahci_eng_timeout(struct ata_
struct ata_host_set *host_set = ap->host_set;
void __iomem *mmio = host_set->mmio_base;
void __iomem *port_mmio = ahci_port_base(mmio, ap->port_no);
- struct ata_queued_cmd *qc;
+ struct ata_queued_cmd *qc = ata_qc_from_tag(ap, ap->active_tag);
unsigned long flags;
DPRINTK("ENTER\n");
+ if (ata_qc_atapi_timeout(qc))
+ goto out;
+
spin_lock_irqsave(&host_set->lock, flags);
ahci_intr_error(ap, readl(port_mmio + PORT_IRQ_STAT));
- qc = ata_qc_from_tag(ap, ap->active_tag);
if (!qc) {
printk(KERN_ERR "ata%u: BUG: timeout without command\n",
ap->id);
@@ -636,6 +638,9 @@ static void ahci_eng_timeout(struct ata_
}
spin_unlock_irqrestore(&host_set->lock, flags);
+
+ out:
+ DPRINTK("LEAVE\n");
}
static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc)
@@ -660,8 +665,18 @@ static inline int ahci_host_intr(struct
if (status & PORT_IRQ_FATAL) {
ahci_intr_error(ap, status);
- if (qc)
- ata_qc_complete(qc, AC_ERR_OTHER);
+ if (qc) {
+ unsigned int err;
+ if (status & PORT_IRQ_TF_ERR)
+ err = AC_ERR_DEV;
+ else if (status & (PORT_IRQ_HBUS_ERR | PORT_IRQ_HBUS_DATA_ERR))
+ err = AC_ERR_HOST_BUS;
+ else if (status & (PORT_IRQ_IF_ERR))
+ err = AC_ERR_ATA_BUS;
+ else
+ err = AC_ERR_OTHER;
+ ata_qc_complete(qc, err);
+ }
}
return 1;
diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c
index f606bdd..1bf8479 100644
--- a/drivers/scsi/libata-core.c
+++ b/drivers/scsi/libata-core.c
@@ -3421,6 +3421,33 @@ fsm_start:
goto fsm_start;
}
+int ata_qc_atapi_timeout(struct ata_queued_cmd *qc)
+{
+ struct ata_port *ap = qc->ap;
+ struct ata_host_set *host_set = ap->host_set;
+ struct ata_device *dev = qc->dev;
+ struct scsi_cmnd *cmd = qc->scsicmd;
+ unsigned long flags;
+
+ if (qc->dev->class != ATA_DEV_ATAPI || cmd == NULL)
+ return 0;
+
+ if (cmd->eh_eflags & SCSI_EH_CANCEL_CMD)
+ return 0;
+
+ /* finish completing original command */
+ spin_lock_irqsave(&host_set->lock, flags);
+ __ata_qc_complete(qc);
+ spin_unlock_irqrestore(&host_set->lock, flags);
+
+ atapi_request_sense(ap, dev, cmd);
+
+ cmd->result = (CHECK_CONDITION << 1) | (DID_OK << 16);
+ scsi_finish_command(cmd);
+
+ return 1;
+}
+
/**
* ata_qc_timeout - Handle timeout of queued command
* @qc: Command that timed out
@@ -3444,31 +3471,13 @@ static void ata_qc_timeout(struct ata_qu
{
struct ata_port *ap = qc->ap;
struct ata_host_set *host_set = ap->host_set;
- struct ata_device *dev = qc->dev;
u8 host_stat = 0, drv_stat;
unsigned long flags;
DPRINTK("ENTER\n");
- /* FIXME: doesn't this conflict with timeout handling? */
- if (qc->dev->class == ATA_DEV_ATAPI && qc->scsicmd) {
- struct scsi_cmnd *cmd = qc->scsicmd;
-
- if (!(cmd->eh_eflags & SCSI_EH_CANCEL_CMD)) {
-
- /* finish completing original command */
- spin_lock_irqsave(&host_set->lock, flags);
- __ata_qc_complete(qc);
- spin_unlock_irqrestore(&host_set->lock, flags);
-
- atapi_request_sense(ap, dev, cmd);
-
- cmd->result = (CHECK_CONDITION << 1) | (DID_OK << 16);
- scsi_finish_command(cmd);
-
- goto out;
- }
- }
+ if (ata_qc_atapi_timeout(qc))
+ goto out;
spin_lock_irqsave(&host_set->lock, flags);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 79b7e6f..a10be07 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -478,6 +478,7 @@ extern u8 ata_bmdma_status(struct ata_
extern void ata_bmdma_irq_clear(struct ata_port *ap);
extern void ata_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask);
extern void ata_eng_timeout(struct ata_port *ap);
+extern int ata_qc_atapi_timeout(struct ata_queued_cmd *qc);
extern void ata_scsi_simulate(u16 *id, struct scsi_cmnd *cmd,
void (*done)(struct scsi_cmnd *));
extern int ata_std_bios_param(struct scsi_device *sdev,
Tejun Heo wrote:
> I've been trying to do about the same thing and here's my version
> which only fixes error/sense handling. This patch makes ahci's
> eng_timeout act correctly w.r.t ATAPI sense requesting. libata ATAPI
> request sensing mechanism was not broken. What's broken was AHCI's
> eng_timeout handler. And, yes, this problem disappears with request
> sensing via active qc turn-around in your patch.
Yes, that's intentional. I fixed the problem in AHCI by copying code
from libata-core's ata_qc_timeout(), similar to what you did. Didn't
like that solution at all, so I started over from scratch.
> As I've written several times, I'm not a big fan of sense requesting
> by turning around active qc. For libata's EH to work any better,
> handing-over failed commands to EH is necessary with or without
> request sensing. Hmm... It's true that the current implementation
> used for ATAPI request sensing needs improvements. Remember those
> patches which implement generic failed command handover and perform
> request sensing from EH with proper timeout and synchronization?
We'll see how things shake out in the future.
One of the reasons I haven't responded to your ATA exceptions doc RFC is
that I don't like to plan. Linux isn't about roadmaps, it's about
what's the next-best-step. I think my current EH fixes are the next
best step, as it cleans up the error handler a lot, and gets things
working. Next step after that? No idea. I just go on gut feeling I
have based on direction.
The current feeling is that we should move away from complex
dependencies on the SCSI EH layer, and do all our own error handling
(perhaps in ata_wq). This will allow use of libata as a block driver.
> Ah.. also, I'm working on sil24 ATAPI support. However, it seems that
> I need to do a little bit more; unfortunately, I'm not sure which....
> INQUIRY succeeds and then my drive fails the first TUR with SK 6h
> (UNIT ATTENTION) which is probably due to previous phy reset. Then,
> my drive fails to repsond to following REQUEST SENSE.
>
> With my patched AHCI, REQUEST SENSE works and after SK 06h and several
> SK 02h's (NOT READY), it comes online and works okay.
>
> As INQUIRY works okay, my hunch is that I'm not performing TUR
> properly (that is ATAPI command with no data) and the drive locks up
> after it. I'll fool around with this more and report.
TUR fails for me, too. It triggers the EH code, which is why I wound up
needing to fix it :)
Have you dumped the D2H FIS returned by the drive, when the TUR fails?
What does it look like? What does the D2H FIS look like after REQUEST
SENSE?
I bet there is some "clear error" actions we need to take on sil24,
before it work there.
Jeff
Jeff Garzik wrote:
> Tejun Heo wrote:
>
>> I've been trying to do about the same thing and here's my version
>> which only fixes error/sense handling. This patch makes ahci's
>> eng_timeout act correctly w.r.t ATAPI sense requesting. libata ATAPI
>> request sensing mechanism was not broken. What's broken was AHCI's
>> eng_timeout handler. And, yes, this problem disappears with request
>> sensing via active qc turn-around in your patch.
>
>
> Yes, that's intentional. I fixed the problem in AHCI by copying code
> from libata-core's ata_qc_timeout(), similar to what you did. Didn't
> like that solution at all, so I started over from scratch.
>
>
>> As I've written several times, I'm not a big fan of sense requesting
>> by turning around active qc. For libata's EH to work any better,
>> handing-over failed commands to EH is necessary with or without
>> request sensing. Hmm... It's true that the current implementation
>> used for ATAPI request sensing needs improvements. Remember those
>> patches which implement generic failed command handover and perform
>> request sensing from EH with proper timeout and synchronization?
>
>
> We'll see how things shake out in the future.
>
> One of the reasons I haven't responded to your ATA exceptions doc RFC is
> that I don't like to plan. Linux isn't about roadmaps, it's about
> what's the next-best-step. I think my current EH fixes are the next
> best step, as it cleans up the error handler a lot, and gets things
> working. Next step after that? No idea. I just go on gut feeling I
> have based on direction.
My RFC was written *after* the patches to ease the reviewing/integration
process. Well, that was the intention. I understand your point but I
think that EH needs some big changes, so it might be beneficial to
establish some consensus such that other developers can do stuff that
can be accepted.
>
> The current feeling is that we should move away from complex
> dependencies on the SCSI EH layer, and do all our own error handling
> (perhaps in ata_wq). This will allow use of libata as a block driver.
>
For departure of libata from SCSI, I was thinking more of another more
generic block device framework in which libata can live in. And I
thought that it was reasonable to assume that the framework would supply
a EH mechanism which supports queue stalling/draining and separate
thread. So, my EH patches tried to make the same environment for libata
so that it doesn't have to be changed drastically later. All those
glueing codes were separated in one or two functions.
The point I'm trying to make here is not that my EH design should be
accepted or anything but that without established consensus it's very
difficult for contributing developers like me to develop substantial
part for libata.
One more thing that bothers me is that even with splitted patches and
detailed document, I couldn't get a discussion started on the mailing
list. No discussion, no consensus. I think we need to improve things
on this front.
>
>> Ah.. also, I'm working on sil24 ATAPI support. However, it seems that
>> I need to do a little bit more; unfortunately, I'm not sure which....
>> INQUIRY succeeds and then my drive fails the first TUR with SK 6h
>> (UNIT ATTENTION) which is probably due to previous phy reset. Then,
>> my drive fails to repsond to following REQUEST SENSE.
>>
>> With my patched AHCI, REQUEST SENSE works and after SK 06h and several
>> SK 02h's (NOT READY), it comes online and works okay.
>>
>> As INQUIRY works okay, my hunch is that I'm not performing TUR
>> properly (that is ATAPI command with no data) and the drive locks up
>> after it. I'll fool around with this more and report.
>
>
> TUR fails for me, too. It triggers the EH code, which is why I wound up
> needing to fix it :)
>
> Have you dumped the D2H FIS returned by the drive, when the TUR fails?
> What does it look like? What does the D2H FIS look like after REQUEST
> SENSE?
>
> I bet there is some "clear error" actions we need to take on sil24,
> before it work there.
Working on it, will report soon.
Thanks. :-)
--
tejun
Tejun Heo wrote:
> My RFC was written *after* the patches to ease the reviewing/integration
> process. Well, that was the intention. I understand your point but I
> think that EH needs some big changes, so it might be beneficial to
Agreed, libata EH still needs a lot of work. ATA device error handling
could better, ATA bus and PCI bus errors need to be handled a lot
better, etc.
> establish some consensus such that other developers can do stuff that
> can be accepted.
>
>>
>> The current feeling is that we should move away from complex
>> dependencies on the SCSI EH layer, and do all our own error handling
>> (perhaps in ata_wq). This will allow use of libata as a block driver.
>>
>
> For departure of libata from SCSI, I was thinking more of another more
> generic block device framework in which libata can live in. And I
> thought that it was reasonable to assume that the framework would supply
> a EH mechanism which supports queue stalling/draining and separate
> thread. So, my EH patches tried to make the same environment for libata
A big reason why libata uses the SCSI layer is infrastructure like this.
It would certainly be nice to see timeouts and EH at the block layer.
The block layer itself already supports queue stalling/draining.
> so that it doesn't have to be changed drastically later. All those
> glueing codes were separated in one or two functions.
>
> The point I'm trying to make here is not that my EH design should be
> accepted or anything but that without established consensus it's very
> difficult for contributing developers like me to develop substantial
> part for libata.
>
> One more thing that bothers me is that even with splitted patches and
> detailed document, I couldn't get a discussion started on the mailing
> list. No discussion, no consensus. I think we need to improve things
> on this front.
Honestly there isn't a ton of discussion beforehand about anything in
libata :) It's mainly just like Linux itself... no roadmap, its more
just about what patches are accepted. The consensus is the code. I
have a general idea of where I want libata to go, in my mind, but that
picture is very rough and fuzzy :) I let evolution sort out the details.
Speaking specifically, changes to libata's error handling should have
very practical, observable effects. Error handling code paths are
travelled so infrequently by users, that changes for the general purpose
of "improve something ugly" are not interesting. Changes which address
problems people are seeing in the field are top priority:
* kicking the device with SRST, if it continues to signal BSY. If that
fails, do a hard reset with SATA COMRESET or <some hardware-specific
method>. Note we really really really want to prefer SRST to other
methods of reset, as SRST is much nicer to the device, and gives the
device the opportunity to flush its writeback cache.
Note that some hardware may prefer that you use a hardware-specific
method of issuing SRST, so that it can reset internal state machines
along with actually issuing the SRST to the device. This probably means
a new protocol, ATA_PROT_SRST (which aligns nicely with SCSI SAT spec).
* retrying rather than cancelling commands that fail due to pci/dma/crc
error. some hardware (PCI IDE BMDMA) indicates DMA/PCI errors via
timeout. Other hardware is nicer, and indicates such via interrupt.
This may involve exporting scsi_eh_finish_cmd() and
scsi_eh_flush_done_q(), and un-exporting scsi_finish_command(). Hence
my reluctance to expand the use of scsi_finish_command(), which is
really inadequate for our purposes.
This is also necessary for NCQ error handling.
* better use of SError. some vendor drivers read+write SError on every
disk transaction, but I think that's overkill. Reading SError to check
for errors, against a stored mask, might not be a bad idea.
* longer term: add logic to "downshift" UDMA mode and/or SATA speed, if
CRC/DMA errors persist.
I'm much less inclined to take patches which don't make progress towards
any of these IMO critical areas of EH importance.
Jeff
On Tue, Nov 15 2005, Jeff Garzik wrote:
> >For departure of libata from SCSI, I was thinking more of another more
> >generic block device framework in which libata can live in. And I
> >thought that it was reasonable to assume that the framework would supply
> >a EH mechanism which supports queue stalling/draining and separate
> >thread. So, my EH patches tried to make the same environment for libata
>
> A big reason why libata uses the SCSI layer is infrastructure like this.
> It would certainly be nice to see timeouts and EH at the block layer.
> The block layer itself already supports queue stalling/draining.
I have a pretty simple plan for this:
- Add a timer to struct request. It already has a timeout field for
SG_IO originated requests, we could easily utilize this in general.
I'm not sure how the querying of timeout would happen so far, it would
probably require a q->set_rq_timeout() hook to ask the low level
driver to set/return rq->timeout for a given request.
- Add a timeout hook to struct request_queue that would get invoked from
the timeout handler. Something along the lines of:
- Timeout on a request happens. Freeze the queue and use
kblockd to take the actual timeout into process context, where
we call the queue ->rq_timeout() hook. Unfreeze/reschedule
queue operations based on what the ->rq_timeout() hook tells
us.
That is generic enough to be able to arm the timeout automatically from
->elevator_activate_req_fn() and dearm it when it completes or gets
deactivated. It should also be possible to implement the SCSI error
handling on top of that.
--
Jens Axboe
Jeff Garzik wrote:
>
> TUR fails for me, too. It triggers the EH code, which is why I wound up
> needing to fix it :)
>
> Have you dumped the D2H FIS returned by the drive, when the TUR fails?
> What does it look like? What does the D2H FIS look like after REQUEST
> SENSE?
>
> I bet there is some "clear error" actions we need to take on sil24,
> before it work there.
>
Okay, I finally got my SATA dvd writer working and successfully mounted
a debian cdrom (with some number of gloss hacks). I had to do the
following to get this thing to work.
* prb->fis doesn't contain proper signature after phy reset. I just
forced DEV_ATAPI as a temporary measure. In the original sii driver
where SRST is used instead of PHY RESET, my ATAPI drive registered
correct signature but my harddisk's signature didn't show up.
* As all commands hang after failed TUR, which BTW fails rightfully with
06h (UNIT ATTENTION due to prior reset) and then 02h's (NOT READY), I
avoided TUR failures by issuing REQUEST SENSE first, which returns NO
SENSE but does clear UA condition, and then waiting for more than 10secs
to give the drive time to become ready.
* With above two changes, no command fails and I can mount the cdrom.
------------------
And here are things that I've found out so far....
* ATAPI commands without data (TEST_UNIT_READY, ALLOW_MEDIUM_REMOVAL...)
work happily with any prb->ctrl flag (no flag, PRB_CTRL_PACKET_READ or
PRB_CTRL_PACKET_WRITE).
* Currently, sil24_reset_controller() is called on every error interrupt
to make sure the controller is ready for further operation.
Unfortunately, the current reset_controller() seems to reset PHY too.
The drive becomes NOT READY after reset_controller(). So, when TUR
fails, it falls into TUR, fail with NOT READY, reset which makes the
drive NOT READY again cycle.
* Wihtout sil24_reset_controller(), no further command can be issued.
The controller doesn't seem to be operating normally after DEV_ERR.
--------------------
So, what we need to know to make sil24 work happily with ATAPI devices.
* How to get device signature on initialization.
* How to make the controller operational again after a DEV_ERR without
affecting the attached device.
--
tejun
Tejun Heo wrote:
> * prb->fis doesn't contain proper signature after phy reset. I just
This makes sense. Each PRB slot in the card's RAM only contains
something useful after you have submitted a PRB, and had some results
returned.
> * As all commands hang after failed TUR, which BTW fails rightfully with
> 06h (UNIT ATTENTION due to prior reset) and then 02h's (NOT READY), I
> avoided TUR failures by issuing REQUEST SENSE first, which returns NO
> SENSE but does clear UA condition, and then waiting for more than 10secs
> to give the drive time to become ready.
10 seconds is certainly a long time.
grep for msleep(150) in libata code, though.
> * With above two changes, no command fails and I can mount the cdrom.
>
> ------------------
>
> And here are things that I've found out so far....
>
> * ATAPI commands without data (TEST_UNIT_READY, ALLOW_MEDIUM_REMOVAL...)
> work happily with any prb->ctrl flag (no flag, PRB_CTRL_PACKET_READ or
> PRB_CTRL_PACKET_WRITE).
I suppose no-flag is the right answer, then...
> * Currently, sil24_reset_controller() is called on every error interrupt
> to make sure the controller is ready for further operation.
> Unfortunately, the current reset_controller() seems to reset PHY too.
> The drive becomes NOT READY after reset_controller(). So, when TUR
> fails, it falls into TUR, fail with NOT READY, reset which makes the
> drive NOT READY again cycle.
Standard init sequence includes checking SStatus, then if a device is
present, waiting until Port Ready (bit 31) of PORT_CTRL_STAT is set. It
doesn't look like sata_sil24 does that.
> * Wihtout sil24_reset_controller(), no further command can be issued.
> The controller doesn't seem to be operating normally after DEV_ERR.
The port stops, when any error occurs. For device errors, set
PORT_CS_INIT bit in PORT_CTRL_STAT, then wait for Port Ready (bit 31,
see above).
> So, what we need to know to make sil24 work happily with ATAPI devices.
>
> * How to get device signature on initialization.
AFAICS, you _must_ send a soft reset PRB. This will be needed anyway,
for port multiplier support.
> * How to make the controller operational again after a DEV_ERR without
> affecting the attached device.
Assert PORT_CS_INIT, then wait for Port Ready. __ONLY__ do this for
errors PORT_CERR_DEV and PORT_CERR_SDB.
Jeff
Jens Axboe wrote:
> On Tue, Nov 15 2005, Jeff Garzik wrote:
>
>>>For departure of libata from SCSI, I was thinking more of another more
>>>generic block device framework in which libata can live in. And I
>>>thought that it was reasonable to assume that the framework would supply
>>>a EH mechanism which supports queue stalling/draining and separate
>>>thread. So, my EH patches tried to make the same environment for libata
>>
>>A big reason why libata uses the SCSI layer is infrastructure like this.
>> It would certainly be nice to see timeouts and EH at the block layer.
>> The block layer itself already supports queue stalling/draining.
>
>
> I have a pretty simple plan for this:
>
> - Add a timer to struct request. It already has a timeout field for
> SG_IO originated requests, we could easily utilize this in general.
> I'm not sure how the querying of timeout would happen so far, it would
> probably require a q->set_rq_timeout() hook to ask the low level
> driver to set/return rq->timeout for a given request.
>
> - Add a timeout hook to struct request_queue that would get invoked from
> the timeout handler. Something along the lines of:
>
> - Timeout on a request happens. Freeze the queue and use
> kblockd to take the actual timeout into process context, where
> we call the queue ->rq_timeout() hook. Unfreeze/reschedule
> queue operations based on what the ->rq_timeout() hook tells
> us.
>
> That is generic enough to be able to arm the timeout automatically from
> ->elevator_activate_req_fn() and dearm it when it completes or gets
> deactivated. It should also be possible to implement the SCSI error
> handling on top of that.
>
To disable the timeout would you then have scsi_done call a block layer
function to disarm it then follow the current flow where or do you think
it would be nice to move the scsi softirq code up to block layer. So
scsi_done would call a block layer function that would disarm the timer,
add the request to a block layer softirq list (a list like scsi-ml's
scsi_done_q), and then in the block layer softirq function it could call
a request_queue callout which for scsi-ml's device queue would call
scsi_decide_disposition and return if it wanted the request requeued or
how many sectors completed or to kick off the eh. I had stated on this
for my block layer multipath driver, but can seperate the patches if
this would be useful.
Would ide benefit from running from a softirq and would it be able to
use such a thing?
On Tue, Nov 15 2005, Mike Christie wrote:
> Jens Axboe wrote:
> >On Tue, Nov 15 2005, Jeff Garzik wrote:
> >
> >>>For departure of libata from SCSI, I was thinking more of another more
> >>>generic block device framework in which libata can live in. And I
> >>>thought that it was reasonable to assume that the framework would supply
> >>>a EH mechanism which supports queue stalling/draining and separate
> >>>thread. So, my EH patches tried to make the same environment for libata
> >>
> >>A big reason why libata uses the SCSI layer is infrastructure like this.
> >>It would certainly be nice to see timeouts and EH at the block layer.
> >>The block layer itself already supports queue stalling/draining.
> >
> >
> >I have a pretty simple plan for this:
> >
> >- Add a timer to struct request. It already has a timeout field for
> > SG_IO originated requests, we could easily utilize this in general.
> > I'm not sure how the querying of timeout would happen so far, it would
> > probably require a q->set_rq_timeout() hook to ask the low level
> > driver to set/return rq->timeout for a given request.
> >
> >- Add a timeout hook to struct request_queue that would get invoked from
> > the timeout handler. Something along the lines of:
> >
> > - Timeout on a request happens. Freeze the queue and use
> > kblockd to take the actual timeout into process context, where
> > we call the queue ->rq_timeout() hook. Unfreeze/reschedule
> > queue operations based on what the ->rq_timeout() hook tells
> > us.
> >
> >That is generic enough to be able to arm the timeout automatically from
> >->elevator_activate_req_fn() and dearm it when it completes or gets
> >deactivated. It should also be possible to implement the SCSI error
> >handling on top of that.
> >
>
> To disable the timeout would you then have scsi_done call a block layer
> function to disarm it then follow the current flow where or do you think
> it would be nice to move the scsi softirq code up to block layer. So
> scsi_done would call a block layer function that would disarm the timer,
> add the request to a block layer softirq list (a list like scsi-ml's
> scsi_done_q), and then in the block layer softirq function it could call
> a request_queue callout which for scsi-ml's device queue would call
> scsi_decide_disposition and return if it wanted the request requeued or
> how many sectors completed or to kick off the eh. I had stated on this
> for my block layer multipath driver, but can seperate the patches if
> this would be useful.
Yeah, that was part of my plan as well. I did post such a patch a year
or so ago, in a thread about decreasing ide completion latencies.
> Would ide benefit from running from a softirq and would it be able to
> use such a thing?
It's generally useful as it allows lock free completion from the irq
path, so that's goodness.
--
Jens Axboe
Jeff Garzik wrote:
>
> The port stops, when any error occurs. For device errors, set
> PORT_CS_INIT bit in PORT_CTRL_STAT, then wait for Port Ready (bit 31,
> see above).
>
Yeap, this did the trick. I'm working on SRST/init stuff and I think I
can post patches later today. What workload do you use for testing a
ATAPI device? I'm currently thinking of the following...
* mounting & tarr'ing cdrom & unmount
* repeat above with eject/load
* burning a cdrom
* ripping a music cd with cdparanoia
Any other thing I can try?
--
tejun
Tejun Heo wrote:
> Jeff Garzik wrote:
>
>>
>> The port stops, when any error occurs. For device errors, set
>> PORT_CS_INIT bit in PORT_CTRL_STAT, then wait for Port Ready (bit 31,
>> see above).
>>
>
> Yeap, this did the trick. I'm working on SRST/init stuff and I think I
> can post patches later today. What workload do you use for testing a
> ATAPI device? I'm currently thinking of the following...
>
> * mounting & tarr'ing cdrom & unmount
> * repeat above with eject/load
> * burning a cdrom
> * ripping a music cd with cdparanoia
>
> Any other thing I can try?
I burn Fedora Core CDs and DVDs, and then run
/usr/lib/anaconda-runtime/checkisomd5 --verbose /dev/scd0
FC ISOs, and probably many others as well, appear to contain embedded
checksums. This is a really good test, I've found. I'll burn CDs/DVDs,
then use that to validate them on another machine. Or I'll use
checkisomd5 simply as a test of libata ATAPI itself.
On a side note:
As a scan through tons of operating system code and vendor drivers
shows, as well as behavior I'm seeing on my AHCI + Plextor setup, it is
probably helpful for the OS driver to issue internal retries, if the
command returns NOT READY.
My setup works without it, successfully burning CDs and DVDs, but has
problems with really long commands, and such those which occur when
cdrecord(1) finishes a CD burn, and performs its "fixating" step.
It may also be helpful to issue TUR until NOT READY goes away, in
ata_bus_probe() after ata_set_mode() completes.
Jeff
Jeff Garzik wrote:
>
> My setup works without it, successfully burning CDs and DVDs, but has
> problems with really long commands, and such those which occur when
> cdrecord(1) finishes a CD burn, and performs its "fixating" step.
Is this just a case of too-short of a SCSI timeout on CLOSE_TRACK?
Drivers I have worked on for this generally apply a 2 minute timeout
for that particular operation, and never have to retry.
Cheers
Mark Lord wrote:
> Jeff Garzik wrote:
>
>>
>> My setup works without it, successfully burning CDs and DVDs, but has
>> problems with really long commands, and such those which occur when
>> cdrecord(1) finishes a CD burn, and performs its "fixating" step.
>
>
> Is this just a case of too-short of a SCSI timeout on CLOSE_TRACK?
> Drivers I have worked on for this generally apply a 2 minute timeout
> for that particular operation, and never have to retry.
I haven't narrowed it down yet. I see the timeout handler kick in, when
burning CDs and on one other random occasion. There is clearly a
timeout real/expectations mismatch somewhere.
Jeff
On Tue, Nov 15 2005, Jens Axboe wrote:
> On Tue, Nov 15 2005, Mike Christie wrote:
> > Jens Axboe wrote:
> > >On Tue, Nov 15 2005, Jeff Garzik wrote:
> > >
> > >>>For departure of libata from SCSI, I was thinking more of another more
> > >>>generic block device framework in which libata can live in. And I
> > >>>thought that it was reasonable to assume that the framework would supply
> > >>>a EH mechanism which supports queue stalling/draining and separate
> > >>>thread. So, my EH patches tried to make the same environment for libata
> > >>
> > >>A big reason why libata uses the SCSI layer is infrastructure like this.
> > >>It would certainly be nice to see timeouts and EH at the block layer.
> > >>The block layer itself already supports queue stalling/draining.
> > >
> > >
> > >I have a pretty simple plan for this:
> > >
> > >- Add a timer to struct request. It already has a timeout field for
> > > SG_IO originated requests, we could easily utilize this in general.
> > > I'm not sure how the querying of timeout would happen so far, it would
> > > probably require a q->set_rq_timeout() hook to ask the low level
> > > driver to set/return rq->timeout for a given request.
> > >
> > >- Add a timeout hook to struct request_queue that would get invoked from
> > > the timeout handler. Something along the lines of:
> > >
> > > - Timeout on a request happens. Freeze the queue and use
> > > kblockd to take the actual timeout into process context, where
> > > we call the queue ->rq_timeout() hook. Unfreeze/reschedule
> > > queue operations based on what the ->rq_timeout() hook tells
> > > us.
> > >
> > >That is generic enough to be able to arm the timeout automatically from
> > >->elevator_activate_req_fn() and dearm it when it completes or gets
> > >deactivated. It should also be possible to implement the SCSI error
> > >handling on top of that.
> > >
> >
> > To disable the timeout would you then have scsi_done call a block layer
> > function to disarm it then follow the current flow where or do you think
> > it would be nice to move the scsi softirq code up to block layer. So
> > scsi_done would call a block layer function that would disarm the timer,
> > add the request to a block layer softirq list (a list like scsi-ml's
> > scsi_done_q), and then in the block layer softirq function it could call
> > a request_queue callout which for scsi-ml's device queue would call
> > scsi_decide_disposition and return if it wanted the request requeued or
> > how many sectors completed or to kick off the eh. I had stated on this
> > for my block layer multipath driver, but can seperate the patches if
> > this would be useful.
>
> Yeah, that was part of my plan as well. I did post such a patch a year
> or so ago, in a thread about decreasing ide completion latencies.
>
> > Would ide benefit from running from a softirq and would it be able to
> > use such a thing?
>
> It's generally useful as it allows lock free completion from the irq
> path, so that's goodness.
I updated that patch, and converted IDE and SCSI to use it. See the
results here:
http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
The main change from the version posted last october is killing the
'slightly' overdesigned completion queue hashing.
--
Jens Axboe
Jens Axboe wrote:
> I updated that patch, and converted IDE and SCSI to use it. See the
> results here:
>
> http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
>
> The main change from the version posted last october is killing the
> 'slightly' overdesigned completion queue hashing.
Nifty, I like. Comments:
* use of spin_lock_irq() in all completion paths now makes me nervous.
* certainly it's what SCSI does now, but is a softirq really necessary?
Using a tasklet would kill all that per-cpu code, and notifier.
On Wed, Nov 16 2005, Jeff Garzik wrote:
> Jens Axboe wrote:
> >I updated that patch, and converted IDE and SCSI to use it. See the
> >results here:
> >
> >http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
> >
> >The main change from the version posted last october is killing the
> >'slightly' overdesigned completion queue hashing.
>
> Nifty, I like. Comments:
>
> * use of spin_lock_irq() in all completion paths now makes me nervous.
Should be fine from the paths originating from blk_done_softirq(), as we
know interrupts are enabled in the first place. But generally I agree,
whenever in doubt always always use the irq saving variants.
> * certainly it's what SCSI does now, but is a softirq really necessary?
> Using a tasklet would kill all that per-cpu code, and notifier.
It would work fine with a tasklet of course, but it's going to generate
a _lot_ of traffic on io busy systems so I felt a dedicated softirq was
the way to go.
--
Jens Axboe
Jens Axboe wrote:
> On Wed, Nov 16 2005, Jeff Garzik wrote:
>
>>Jens Axboe wrote:
>>
>>>I updated that patch, and converted IDE and SCSI to use it. See the
>>>results here:
>>>
>>>http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
>>>
>>>The main change from the version posted last october is killing the
>>>'slightly' overdesigned completion queue hashing.
>>
>>Nifty, I like. Comments:
>>
>>* use of spin_lock_irq() in all completion paths now makes me nervous.
>
>
> Should be fine from the paths originating from blk_done_softirq(), as we
> know interrupts are enabled in the first place. But generally I agree,
> whenever in doubt always always use the irq saving variants.
>
>
>>* certainly it's what SCSI does now, but is a softirq really necessary?
>> Using a tasklet would kill all that per-cpu code, and notifier.
>
>
> It would work fine with a tasklet of course, but it's going to generate
> a _lot_ of traffic on io busy systems so I felt a dedicated softirq was
> the way to go.
fair enough. ACK.
Jeff
Jens Axboe wrote:
> I updated that patch, and converted IDE and SCSI to use it. See the
> results here:
>
> http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
>
> The main change from the version posted last october is killing the
> 'slightly' overdesigned completion queue hashing.
Oh yeah: you shouldn't need to make scsi_retry_command() exported.
It's private to scsi, to just publish it in scsi_priv.h.
Tangent: As part of my libata work, I'm thinking about un-exporting
scsi_finish_command(), and instead exporting scsi_eh_flush_done_q() and
scsi_eh_finish_cmd().
Jeff
On Wed, Nov 16 2005, Jeff Garzik wrote:
> Jens Axboe wrote:
> >I updated that patch, and converted IDE and SCSI to use it. See the
> >results here:
> >
> >http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
> >
> >The main change from the version posted last october is killing the
> >'slightly' overdesigned completion queue hashing.
>
> Oh yeah: you shouldn't need to make scsi_retry_command() exported.
> It's private to scsi, to just publish it in scsi_priv.h.
Oh right, I'll make that change. Thanks!
> Tangent: As part of my libata work, I'm thinking about un-exporting
> scsi_finish_command(), and instead exporting scsi_eh_flush_done_q() and
> scsi_eh_finish_cmd().
Sounds ok.
--
Jens Axboe
On 11/16/05, Jens Axboe <[email protected]> wrote:
> I updated that patch, and converted IDE and SCSI to use it. See the
> results here:
>
> http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
I like it but:
* "we know it's either an FS or PC request" assumption in
ide_softirq_done() is really wrong
* same with "uptodate = rq->errors"
* blk_complete_request() is called with ide_lock hold but
ide_softirq_done() also takes ide_lock - is this correct?
"There's still room for improvement, as __ide_end_request() really
could drop the lock after getting HWGROUP->rq (why does it need to
hold it in the first place? If ->rq access isn't serialized, we are
screwed anyways)."
ide_preempt? and yes we are screwed...
Bartlomiej
On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> On 11/16/05, Jens Axboe <[email protected]> wrote:
>
> > I updated that patch, and converted IDE and SCSI to use it. See the
> > results here:
> >
> > http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
>
> I like it but:
>
> * "we know it's either an FS or PC request" assumption in
> ide_softirq_done() is really wrong
It used to be correct :-)
No, the problem is that I changed the partial stuff to allow the
deletion/putting of the request to work for every type of request. But
it definitely needs some more looking into.
> * same with "uptodate = rq->errors"
Yeah. In general, ->errors needs to be streamlined. It's a huge mess
right now and it's making generic code really hard to do because every
driver does their own weird thing with it.
I'd like for IDE to really do stuff the error in ->errors, and push the
retry or whatever counting into a ->retries instead. Then we can honor
the simple rule of, rq->errors:
< 0, it contains an -Exxxx value
== 0, no errors, uptodate
> 0, not uptodate, no specific error info. Usually 1.
> * blk_complete_request() is called with ide_lock hold but
> ide_softirq_done() also takes ide_lock - is this correct?
blk_complete_request() need not be called with the lock held, in fact it
would be best if it wasn't (no point in holding the lock). But right now
it is in ide, because of the below. ide_softirq_done() always needs to
grab the lock. There are no recursion problems there, ide_softirq_done()
is called out-of-order from the actual completion call.
> "There's still room for improvement, as __ide_end_request() really
> could drop the lock after getting HWGROUP->rq (why does it need to
> hold it in the first place? If ->rq access isn't serialized, we are
> screwed anyways)."
>
> ide_preempt? and yes we are screwed...
Irk it's nasty, since it basically means we have to hold ide_lock over
the entire functions looking at hwgroup->rq.
It's ok for __ide_end_request() to be entered with the ide_lock held,
the costly affair is usually completing the request. Which now happens
outside of the lock.
We could split the completion path in two - if we know this call will
end the request completely, we can drop the lock and call into the
blk_complete_request() stuff for free. We know we need to clear
hwgroup->rq anyways, so we can do it up front and complete the request
'privately'. If we are not fully completing the request, keep the lock
and do the partial completion. The bonus here is that the first case
will be the often taken path by far (always with DMA and no errors), the
other cases are not interesting from a performance perspective.
--
Jens Axboe
On 11/16/05, Jens Axboe <[email protected]> wrote:
> On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > On 11/16/05, Jens Axboe <[email protected]> wrote:
> >
> > > I updated that patch, and converted IDE and SCSI to use it. See the
> > > results here:
> > >
> > > http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
> >
> > I like it but:
> >
> > * "we know it's either an FS or PC request" assumption in
> > ide_softirq_done() is really wrong
>
> It used to be correct :-)
Sorry but it has been always like that,
other requests also pass through ide_end_request()
(which of course needs fixing).
> No, the problem is that I changed the partial stuff to allow the
> deletion/putting of the request to work for every type of request. But
> it definitely needs some more looking into.
>
> > * same with "uptodate = rq->errors"
>
> Yeah. In general, ->errors needs to be streamlined. It's a huge mess
> right now and it's making generic code really hard to do because every
> driver does their own weird thing with it.
Agreed.
> I'd like for IDE to really do stuff the error in ->errors, and push the
> retry or whatever counting into a ->retries instead. Then we can honor
> the simple rule of, rq->errors:
>
> < 0, it contains an -Exxxx value
> == 0, no errors, uptodate
> > 0, not uptodate, no specific error info. Usually 1.
This is a very good idea.
> > * blk_complete_request() is called with ide_lock hold but
> > ide_softirq_done() also takes ide_lock - is this correct?
>
> blk_complete_request() need not be called with the lock held, in fact it
> would be best if it wasn't (no point in holding the lock). But right now
> it is in ide, because of the below. ide_softirq_done() always needs to
> grab the lock. There are no recursion problems there, ide_softirq_done()
> is called out-of-order from the actual completion call.
>
> > "There's still room for improvement, as __ide_end_request() really
> > could drop the lock after getting HWGROUP->rq (why does it need to
> > hold it in the first place? If ->rq access isn't serialized, we are
> > screwed anyways)."
> >
> > ide_preempt? and yes we are screwed...
>
> Irk it's nasty, since it basically means we have to hold ide_lock over
> the entire functions looking at hwgroup->rq.
>
> It's ok for __ide_end_request() to be entered with the ide_lock held,
> the costly affair is usually completing the request. Which now happens
> outside of the lock.
We should get rid of ide_preempt later.
This will also allow us to remove ide_do_drive_cmd()
and use blk_execute_rq() exclusively.
> We could split the completion path in two - if we know this call will
> end the request completely, we can drop the lock and call into the
> blk_complete_request() stuff for free. We know we need to clear
> hwgroup->rq anyways, so we can do it up front and complete the request
> 'privately'. If we are not fully completing the request, keep the lock
> and do the partial completion. The bonus here is that the first case
> will be the often taken path by far (always with DMA and no errors), the
> other cases are not interesting from a performance perspective.
Sounds good.
Bartlomiej
On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> On 11/16/05, Jens Axboe <[email protected]> wrote:
> > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > >
> > > > I updated that patch, and converted IDE and SCSI to use it. See the
> > > > results here:
> > > >
> > > > http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
> > >
> > > I like it but:
> > >
> > > * "we know it's either an FS or PC request" assumption in
> > > ide_softirq_done() is really wrong
> >
> > It used to be correct :-)
>
> Sorry but it has been always like that,
> other requests also pass through ide_end_request()
> (which of course needs fixing).
You misunderstand, for calls to blk_complete_request() it wasn't true
initially since it always obyed rq_all_done() (which returns 0 for
non-fs and non-pc requests).
> > Irk it's nasty, since it basically means we have to hold ide_lock over
> > the entire functions looking at hwgroup->rq.
> >
> > It's ok for __ide_end_request() to be entered with the ide_lock held,
> > the costly affair is usually completing the request. Which now happens
> > outside of the lock.
>
> We should get rid of ide_preempt later.
>
> This will also allow us to remove ide_do_drive_cmd()
> and use blk_execute_rq() exclusively.
That would be very nice! Reminds me that there might still be a race
with head insertion and REQ_STARTED request in front in the block layer,
that needs inspection. But killing ide_do_drive_cmd() would be very
nice.
--
Jens Axboe
On 11/16/05, Jens Axboe <[email protected]> wrote:
> On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > >
> > > > > I updated that patch, and converted IDE and SCSI to use it. See the
> > > > > results here:
> > > > >
> > > > > http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
> > > >
> > > > I like it but:
> > > >
> > > > * "we know it's either an FS or PC request" assumption in
> > > > ide_softirq_done() is really wrong
> > >
> > > It used to be correct :-)
> >
> > Sorry but it has been always like that,
> > other requests also pass through ide_end_request()
> > (which of course needs fixing).
>
> You misunderstand, for calls to blk_complete_request() it wasn't true
> initially since it always obyed rq_all_done() (which returns 0 for
> non-fs and non-pc requests).
from blk_complete_request() [ the only user of rq_all_done() ]:
+ /*
+ * for partial completions, fall back to normal end io handling.
+ */
+ if (unlikely(!partial_ok && !rq_all_done(req, nbytes)))
+ if (end_that_request_chunk(req, uptodate, nbytes))
+ return 1;
We still will end up with using ide_softirq_done() for !rq_all_done()
case (non FS/PC request) because majority of them (all?) don't use
partial completions.
Bartlomiej
On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> On 11/16/05, Jens Axboe <[email protected]> wrote:
> > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > > > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > > >
> > > > > > I updated that patch, and converted IDE and SCSI to use it. See the
> > > > > > results here:
> > > > > >
> > > > > > http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
> > > > >
> > > > > I like it but:
> > > > >
> > > > > * "we know it's either an FS or PC request" assumption in
> > > > > ide_softirq_done() is really wrong
> > > >
> > > > It used to be correct :-)
> > >
> > > Sorry but it has been always like that,
> > > other requests also pass through ide_end_request()
> > > (which of course needs fixing).
> >
> > You misunderstand, for calls to blk_complete_request() it wasn't true
> > initially since it always obyed rq_all_done() (which returns 0 for
> > non-fs and non-pc requests).
>
> from blk_complete_request() [ the only user of rq_all_done() ]:
>
> + /*
> + * for partial completions, fall back to normal end io handling.
> + */
> + if (unlikely(!partial_ok && !rq_all_done(req, nbytes)))
> + if (end_that_request_chunk(req, uptodate, nbytes))
> + return 1;
>
> We still will end up with using ide_softirq_done() for !rq_all_done()
> case (non FS/PC request) because majority of them (all?) don't use
> partial completions.
Yes, that's what it looks like now... Note I wrote "wasn't", it used to
look like this:
if (!rq_all_done(req, nbytes)) {
end_that_request_chunk(..);
return;
}
which of course didn't work, so it was changed to the above which then
broke the assumption of what type of requests we expect to see in
ide_softirq_done(). We can't generically handle this case, so it's
probably best to just add this logic to __ide_end_request() - it's just
another case for _not_ using the blk_complete_request() path, just like
the partial case.
--
Jens Axboe
On Wed, Nov 16 2005, Jens Axboe wrote:
> which of course didn't work, so it was changed to the above which then
> broke the assumption of what type of requests we expect to see in
> ide_softirq_done(). We can't generically handle this case, so it's
> probably best to just add this logic to __ide_end_request() - it's just
> another case for _not_ using the blk_complete_request() path, just like
> the partial case.
How about something like this? It requires blk_complete_request() calls
to be full completions of the request (or what is left of it). It cleans
up the calls a lot and fixes the wrong comment for IDE.
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 82b7290..8e6dd9f 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3223,33 +3223,20 @@ static struct notifier_block __devinitda
/**
* blk_complete_request - end I/O on a request
* @req: the request being processed
- * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
- * @nbytes: number of bytes to end I/O on
*
* Description:
- * Ends I/O on a number of sectors attached to @req, and sets it up
- * for the next range of segments (if any) in the cluster. This variant
- * completes the request out-of-order through a softirq handler. The user
- * must have registered a completion callback through
- * blk_queue_softirq_fn() at queue init time.
- *
- * Return:
- * 0 - we are done with this request, call end_that_request_last()
- * 1 - still buffers pending for this request
+ * Ends all I/O on a request. It does not handle partial completions,
+ * unless the driver actually implements this in its completionc callback
+ * through requeueing. Theh actual completion happens out-of-order,
+ * through a softirq handler. The user must have registered a completion
+ * callback through blk_queue_softirq_done().
**/
-int blk_complete_request(struct request *req, int uptodate, unsigned int nbytes,
- int partial_ok)
+
+void blk_complete_request(struct request *req)
{
struct list_head *cpu_list;
unsigned long flags;
- /*
- * for partial completions, fall back to normal end io handling.
- */
- if (unlikely(!partial_ok && !rq_all_done(req, nbytes)))
- if (end_that_request_chunk(req, uptodate, nbytes))
- return 1;
-
BUG_ON(!req->q->softirq_done_fn);
local_irq_save(flags);
@@ -3259,7 +3246,6 @@ int blk_complete_request(struct request
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_restore(flags);
- return 0;
}
EXPORT_SYMBOL(blk_complete_request);
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index f114106..1129bfb 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -58,22 +58,9 @@
void ide_softirq_done(struct request *rq)
{
request_queue_t *q = rq->q;
- int nbytes, uptodate = 1;
add_disk_randomness(rq->rq_disk);
-
- /*
- * we know it's either an FS or PC request
- */
- if (blk_fs_request(rq))
- nbytes = rq->hard_nr_sectors << 9;
- else
- nbytes = rq->data_len;
-
- if (rq->errors < 0)
- uptodate = rq->errors;
-
- end_that_request_chunk(rq, uptodate, nbytes);
+ end_that_request_chunk(rq, rq->errors, rq->data_len);
spin_lock_irq(q->queue_lock);
end_that_request_last(rq);
@@ -83,6 +70,9 @@ void ide_softirq_done(struct request *rq
int __ide_end_request(ide_drive_t *drive, struct request *rq, int uptodate,
int nr_sectors)
{
+ unsigned int nbytes;
+ int ret = 1;
+
BUG_ON(!(rq->flags & REQ_STARTED));
/*
@@ -104,13 +94,30 @@ int __ide_end_request(ide_drive_t *drive
HWGROUP(drive)->hwif->ide_dma_on(drive);
}
- if (!blk_complete_request(rq, uptodate, nr_sectors << 9, 0)) {
+ /*
+ * For partial completions (or non fs/pc requests), use the regular
+ * direct completion path.
+ */
+ nbytes = nr_sectors << 9;
+ if (rq_all_done(rq, nbytes)) {
+ rq->errors = uptodate;
+ rq->data_len = nbytes;
+ blk_complete_request(rq);
+ ret = 0;
+ } else {
+ if (!end_that_request_first(rq, uptodate, nr_sectors)) {
+ add_disk_randomness(rq->rq_disk);
+ end_that_request_last(rq);
+ ret = 0;
+ }
+ }
+
+ if (!ret) {
blkdev_dequeue_request(rq);
HWGROUP(drive)->rq = NULL;
- return 0;
}
- return 1;
+ return ret;
}
EXPORT_SYMBOL(__ide_end_request);
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 4044ba4..bf902cf 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -751,6 +751,8 @@ static void scsi_done(struct scsi_cmnd *
* isn't running --- used by scsi_times_out */
void __scsi_done(struct scsi_cmnd *cmd)
{
+ struct request *rq = cmd->request;
+
/*
* Set the serial numbers back to zero
*/
@@ -760,14 +762,14 @@ void __scsi_done(struct scsi_cmnd *cmd)
if (cmd->result)
atomic_inc(&cmd->device->ioerr_cnt);
- BUG_ON(!cmd->request);
+ BUG_ON(!rq);
/*
* The uptodate/nbytes values don't matter, as we allow partial
* completes and thus will check this in the softirq callback
*/
- cmd->request->completion_data = cmd;
- blk_complete_request(cmd->request, 0, 0, 1);
+ rq->completion_data = cmd;
+ blk_complete_request(rq);
}
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 588b9cc..3ff7c3f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -613,7 +613,7 @@ extern int end_that_request_first(struct
extern int end_that_request_chunk(struct request *, int, int);
extern void end_that_request_last(struct request *);
extern void end_request(struct request *req, int uptodate);
-extern int blk_complete_request(struct request *, int, unsigned int, int);
+extern void blk_complete_request(struct request *);
static inline int rq_all_done(struct request *rq, unsigned int nr_bytes)
{
--
Jens Axboe
On 11/16/05, Jens Axboe <[email protected]> wrote:
> On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > > > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > > > > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > > > >
> > > > > > > I updated that patch, and converted IDE and SCSI to use it. See the
> > > > > > > results here:
> > > > > > >
> > > > > > > http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
> > > > > >
> > > > > > I like it but:
> > > > > >
> > > > > > * "we know it's either an FS or PC request" assumption in
> > > > > > ide_softirq_done() is really wrong
> > > > >
> > > > > It used to be correct :-)
> > > >
> > > > Sorry but it has been always like that,
> > > > other requests also pass through ide_end_request()
> > > > (which of course needs fixing).
> > >
> > > You misunderstand, for calls to blk_complete_request() it wasn't true
> > > initially since it always obyed rq_all_done() (which returns 0 for
> > > non-fs and non-pc requests).
> >
> > from blk_complete_request() [ the only user of rq_all_done() ]:
> >
> > + /*
> > + * for partial completions, fall back to normal end io handling.
> > + */
> > + if (unlikely(!partial_ok && !rq_all_done(req, nbytes)))
> > + if (end_that_request_chunk(req, uptodate, nbytes))
> > + return 1;
> >
> > We still will end up with using ide_softirq_done() for !rq_all_done()
> > case (non FS/PC request) because majority of them (all?) don't use
> > partial completions.
>
> Yes, that's what it looks like now... Note I wrote "wasn't", it used to
> look like this:
>
> if (!rq_all_done(req, nbytes)) {
> end_that_request_chunk(..);
> return;
> }
>
> which of course didn't work, so it was changed to the above which then
> broke the assumption of what type of requests we expect to see in
> ide_softirq_done(). We can't generically handle this case, so it's
> probably best to just add this logic to __ide_end_request() - it's just
> another case for _not_ using the blk_complete_request() path, just like
> the partial case.
Sounds better but I honestly think that you simply cannot obtain
reliable nr_sectors to complete for FS/PC requests just from the
request type. Two examples are: failed disk flush requests and
cd noretry requests (both are of FS type).
IMO the best way to fix it is to actually move more (not less!) of
the logic from driver->end_request() paths to ide_softirq_done().
Cheers,
Bartlomiej
On 11/16/05, Bartlomiej Zolnierkiewicz <[email protected]> wrote:
> On 11/16/05, Jens Axboe <[email protected]> wrote:
> > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > > > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > > > > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > > > > > On 11/16/05, Jens Axboe <[email protected]> wrote:
> > > > > > >
> > > > > > > > I updated that patch, and converted IDE and SCSI to use it. See the
> > > > > > > > results here:
> > > > > > > >
> > > > > > > > http://brick.kernel.dk/git/?p=linux-2.6-block.git;a=shortlog;h=blk-softirq
> > > > > > >
> > > > > > > I like it but:
> > > > > > >
> > > > > > > * "we know it's either an FS or PC request" assumption in
> > > > > > > ide_softirq_done() is really wrong
> > > > > >
> > > > > > It used to be correct :-)
> > > > >
> > > > > Sorry but it has been always like that,
> > > > > other requests also pass through ide_end_request()
> > > > > (which of course needs fixing).
> > > >
> > > > You misunderstand, for calls to blk_complete_request() it wasn't true
> > > > initially since it always obyed rq_all_done() (which returns 0 for
> > > > non-fs and non-pc requests).
> > >
> > > from blk_complete_request() [ the only user of rq_all_done() ]:
> > >
> > > + /*
> > > + * for partial completions, fall back to normal end io handling.
> > > + */
> > > + if (unlikely(!partial_ok && !rq_all_done(req, nbytes)))
> > > + if (end_that_request_chunk(req, uptodate, nbytes))
> > > + return 1;
> > >
> > > We still will end up with using ide_softirq_done() for !rq_all_done()
> > > case (non FS/PC request) because majority of them (all?) don't use
> > > partial completions.
> >
> > Yes, that's what it looks like now... Note I wrote "wasn't", it used to
> > look like this:
> >
> > if (!rq_all_done(req, nbytes)) {
> > end_that_request_chunk(..);
> > return;
> > }
> >
> > which of course didn't work, so it was changed to the above which then
> > broke the assumption of what type of requests we expect to see in
> > ide_softirq_done(). We can't generically handle this case, so it's
> > probably best to just add this logic to __ide_end_request() - it's just
> > another case for _not_ using the blk_complete_request() path, just like
> > the partial case.
>
> Sounds better but I honestly think that you simply cannot obtain
> reliable nr_sectors to complete for FS/PC requests just from the
> request type. Two examples are: failed disk flush requests and
> cd noretry requests (both are of FS type).
first example is bad :-)
> IMO the best way to fix it is to actually move more (not less!) of
> the logic from driver->end_request() paths to ide_softirq_done().
Your latest patch is also a good way to fix it
(now the only thing left is rq->errors/rq->retries discussed earlier).
Bartlomiej
On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > which of course didn't work, so it was changed to the above which then
> > > broke the assumption of what type of requests we expect to see in
> > > ide_softirq_done(). We can't generically handle this case, so it's
> > > probably best to just add this logic to __ide_end_request() - it's just
> > > another case for _not_ using the blk_complete_request() path, just like
> > > the partial case.
> >
> > Sounds better but I honestly think that you simply cannot obtain
> > reliable nr_sectors to complete for FS/PC requests just from the
> > request type. Two examples are: failed disk flush requests and
> > cd noretry requests (both are of FS type).
>
> first example is bad :-)
Both your examples are wrong - a flush request is non-fs/pc, and noretry
requests doesn't impact the type of the request at from this POV.
> > IMO the best way to fix it is to actually move more (not less!) of
> > the logic from driver->end_request() paths to ide_softirq_done().
>
> Your latest patch is also a good way to fix it
> (now the only thing left is rq->errors/rq->retries discussed earlier).
Yeah, that is still pending... I updated the patch series so it's a
clean 1-2-3-4 step series now, there instead of this little additions.
--
Jens Axboe
On 11/16/05, Jens Axboe <[email protected]> wrote:
> On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > > which of course didn't work, so it was changed to the above which then
> > > > broke the assumption of what type of requests we expect to see in
> > > > ide_softirq_done(). We can't generically handle this case, so it's
> > > > probably best to just add this logic to __ide_end_request() - it's just
> > > > another case for _not_ using the blk_complete_request() path, just like
> > > > the partial case.
> > >
> > > Sounds better but I honestly think that you simply cannot obtain
> > > reliable nr_sectors to complete for FS/PC requests just from the
> > > request type. Two examples are: failed disk flush requests and
> > > cd noretry requests (both are of FS type).
> >
> > first example is bad :-)
>
> Both your examples are wrong - a flush request is non-fs/pc, and noretry
> requests doesn't impact the type of the request at from this POV.
I meant doing partial completions of the real request from the
->end_flush() and noretry doesn't impact the type but it does
impact the nr_sectors to complete. However none of these
matters any longer with your latest patch which simplifies things
a lot.
> > > IMO the best way to fix it is to actually move more (not less!) of
> > > the logic from driver->end_request() paths to ide_softirq_done().
> >
> > Your latest patch is also a good way to fix it
> > (now the only thing left is rq->errors/rq->retries discussed earlier).
>
> Yeah, that is still pending... I updated the patch series so it's a
> clean 1-2-3-4 step series now, there instead of this little additions.
Excellent!
Bartlomiej
On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> On 11/16/05, Jens Axboe <[email protected]> wrote:
> > On Wed, Nov 16 2005, Bartlomiej Zolnierkiewicz wrote:
> > > > > which of course didn't work, so it was changed to the above which then
> > > > > broke the assumption of what type of requests we expect to see in
> > > > > ide_softirq_done(). We can't generically handle this case, so it's
> > > > > probably best to just add this logic to __ide_end_request() - it's just
> > > > > another case for _not_ using the blk_complete_request() path, just like
> > > > > the partial case.
> > > >
> > > > Sounds better but I honestly think that you simply cannot obtain
> > > > reliable nr_sectors to complete for FS/PC requests just from the
> > > > request type. Two examples are: failed disk flush requests and
> > > > cd noretry requests (both are of FS type).
> > >
> > > first example is bad :-)
> >
> > Both your examples are wrong - a flush request is non-fs/pc, and noretry
> > requests doesn't impact the type of the request at from this POV.
>
> I meant doing partial completions of the real request from the
> ->end_flush() and noretry doesn't impact the type but it does
> impact the nr_sectors to complete. However none of these
> matters any longer with your latest patch which simplifies things
> a lot.
Ah, partial completions due to hitting an error in the flush. Yeah that
would have broken, you are right. And yes, that should work now, it's
the way it should have been written from the get-go.
--
Jens Axboe
On Wed, Nov 16, 2005 at 05:06:45PM +0100, Bartlomiej Zolnierkiewicz wrote:
> This will also allow us to remove ide_do_drive_cmd()
> and use blk_execute_rq() exclusively.
While we're talking about moving things to generic request-based stuff,
any chance one of you could look over to convert ide-cd internal request
submissions to BLOCK_PC? It's the last user of REQ_PC. After that changing
the CD layer to submit BLOCK_PC commands directly instead of the
->packet_command hook would be a logical next step.
On Sat, Nov 19 2005, Christoph Hellwig wrote:
> On Wed, Nov 16, 2005 at 05:06:45PM +0100, Bartlomiej Zolnierkiewicz wrote:
> > This will also allow us to remove ide_do_drive_cmd()
> > and use blk_execute_rq() exclusively.
>
> While we're talking about moving things to generic request-based stuff,
> any chance one of you could look over to convert ide-cd internal request
> submissions to BLOCK_PC? It's the last user of REQ_PC. After that changing
> the CD layer to submit BLOCK_PC commands directly instead of the
> ->packet_command hook would be a logical next step.
Yup, that's clearly the way to go.
--
Jens Axboe