Hi Tejun,
This is a patch I sent to Jeff few months ago. As you asked, I am
resending it on top of 3.10-rc2 Linus tree. Jeff has said he
applied this patch, but I am not sure what exactly it means ;)
Also, I am not sure about my reading of the statistics and the
trade-off I identified (below), so this patch is a RFC.
The numbers are taken by running 'if=/dev/sd{a,b,c} of=/dev/null'
All time values is in us.
Before this update host lock average holdtime was 3.266532061 and
average waittime was 0.009832679 [1]. After the update average
holdtime (slightly) rose up to 0.335267418 while average waittime
decreased to 0.000320469 [2]. Which means host lock with local
interrupt disabled is held roughly the same while the average
waittime dropped 30 times.
After this update port events are handled with local interrupts
enabled and compete on individual per-port locks with average
holdtime 1.540987475 and average waittime 0.000714864 [3].
Comparing to [1], ata_scsi_queuecmd() holds port locks 2 times
less and waits for locks 13 times less.
The downside of this change is introduction of a kernel thread
and (supposedly) increased total average time of handling a
AHCI interrupt - at most 1.5 times.
The upside is better access times from ata_scsi_queuecmd() to
port locks and moving port interrupt handling out of the
hardware interrupt context.
Thanks!
Lock usage statistics.
1. ahci_interrupt vs ata_scsi_queuecmd (host->lock)
Test holdtime-total waittime-total acquisitions holdtime-avg waittime-avg
#
01. 22732497.77 93399.89 06393367 3.555637862 0.014608874
02. 20358052.08 52869.72 06454133 3.154265969 0.008191607
03. 20322516.57 54981.40 06459318 3.146232554 0.008511951
04. 18558686.89 39178.05 06469468 2.868657344 0.006055838
05. 19069799.90 31961.00 06455953 2.953831897 0.004950625
06. 23783542.56 97159.79 06387322 3.723554654 0.015211350
07. 23889266.74 102625.45 06386666 3.740491007 0.016068705
08. 19284522.61 32655.91 06450568 2.989585198 0.005062486
----------- -----------
avg 3.266532061 0.009832679
2. ahci_single_irq_intr vs ahci_port_thread_fn (host->lock)
Alexander Gordeev (1):
AHCI: Optimize interrupt processing
drivers/ata/acard-ahci.c | 8 ++---
drivers/ata/ahci.c | 54 ++++++++++++++++++-------------
drivers/ata/ahci.h | 10 +++--
drivers/ata/ahci_platform.c | 3 +-
drivers/ata/libahci.c | 74 +++++++++++++++++++++++++------------------
5 files changed, 85 insertions(+), 64 deletions(-)
--
1.7.7.6
--
Regards,
Alexander Gordeev
[email protected]
Split interrupt service routine into hardware context handler and
threaded context handler. That allows to protect ports with individual
locks rather than with a single host-wide lock, which results in better
parallelism.
Signed-off-by: Alexander Gordeev <[email protected]>
---
drivers/ata/acard-ahci.c | 8 ++---
drivers/ata/ahci.c | 54 ++++++++++++++++++-------------
drivers/ata/ahci.h | 10 +++--
drivers/ata/ahci_platform.c | 3 +-
drivers/ata/libahci.c | 74 +++++++++++++++++++++++++------------------
5 files changed, 85 insertions(+), 64 deletions(-)
diff --git a/drivers/ata/acard-ahci.c b/drivers/ata/acard-ahci.c
index 4e94ba2..e429225 100644
--- a/drivers/ata/acard-ahci.c
+++ b/drivers/ata/acard-ahci.c
@@ -409,7 +409,7 @@ static int acard_ahci_init_one(struct pci_dev *pdev, const struct pci_device_id
struct device *dev = &pdev->dev;
struct ahci_host_priv *hpriv;
struct ata_host *host;
- int n_ports, i, rc;
+ int n_ports, n_msis, i, rc;
VPRINTK("ENTER\n");
@@ -436,8 +436,7 @@ static int acard_ahci_init_one(struct pci_dev *pdev, const struct pci_device_id
return -ENOMEM;
hpriv->flags |= (unsigned long)pi.private_data;
- if (!(hpriv->flags & AHCI_HFLAG_NO_MSI))
- pci_enable_msi(pdev);
+ n_msis = ahci_init_interrupts(pdev, hpriv);
hpriv->mmio = pcim_iomap_table(pdev)[AHCI_PCI_BAR];
@@ -499,8 +498,7 @@ static int acard_ahci_init_one(struct pci_dev *pdev, const struct pci_device_id
acard_ahci_pci_print_info(host);
pci_set_master(pdev);
- return ata_host_activate(host, pdev->irq, ahci_interrupt, IRQF_SHARED,
- &acard_ahci_sht);
+ return ahci_host_activate(host, pdev->irq, n_msis, &acard_ahci_sht);
}
module_pci_driver(acard_ahci_pci_driver);
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 251e57d..e4d915f 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1110,14 +1110,14 @@ int ahci_init_interrupts(struct pci_dev *pdev, struct ahci_host_priv *hpriv)
pci_intx(pdev, 1);
return 0;
}
+EXPORT_SYMBOL_GPL(ahci_init_interrupts);
/**
* ahci_host_activate - start AHCI host, request IRQs and register it
* @host: target ATA host
* @irq: base IRQ number to request
* @n_msis: number of MSIs allocated for this host
- * @irq_handler: irq_handler used when requesting IRQs
- * @irq_flags: irq_flags used when requesting IRQs
+ * @sht: scsi_host_template to use when registering the host
*
* Similar to ata_host_activate, but requests IRQs according to AHCI-1.1
* when multiple MSIs were allocated. That is one MSI per port, starting
@@ -1129,43 +1129,59 @@ int ahci_init_interrupts(struct pci_dev *pdev, struct ahci_host_priv *hpriv)
* RETURNS:
* 0 on success, -errno otherwise.
*/
-int ahci_host_activate(struct ata_host *host, int irq, unsigned int n_msis)
+int ahci_host_activate(struct ata_host *host, int irq, unsigned int n_msis,
+ struct scsi_host_template *sht)
{
int i, rc;
-
- /* Sharing Last Message among several ports is not supported */
- if (n_msis < host->n_ports)
- return -EINVAL;
+ unsigned int n_irqs;
rc = ata_host_start(host);
if (rc)
return rc;
- for (i = 0; i < host->n_ports; i++) {
- rc = devm_request_threaded_irq(host->dev,
- irq + i, ahci_hw_interrupt, ahci_thread_fn, IRQF_SHARED,
- dev_driver_string(host->dev), host->ports[i]);
+ n_irqs = min(host->n_ports, n_msis);
+ n_irqs = max(n_irqs, 1u);
+
+ if (n_irqs > 1) {
+ /* Sharing Last Message among several ports is not supported */
+ if (n_irqs < host->n_ports)
+ return -EINVAL;
+
+ for (i = 0; i < n_irqs; i++) {
+ rc = devm_request_threaded_irq(host->dev, irq + i,
+ ahci_multi_irqs_intr, ahci_port_thread_fn,
+ IRQF_SHARED, dev_driver_string(host->dev),
+ host->ports[i]);
+ if (rc)
+ goto out_free_irqs;
+ }
+ } else {
+ rc = devm_request_threaded_irq(host->dev, irq,
+ ahci_single_irq_intr, ahci_thread_fn, IRQF_SHARED,
+ dev_driver_string(host->dev), host);
if (rc)
- goto out_free_irqs;
+ goto out;
}
- for (i = 0; i < host->n_ports; i++)
+ for (i = 0; i < n_irqs; i++)
ata_port_desc(host->ports[i], "irq %d", irq + i);
- rc = ata_host_register(host, &ahci_sht);
+ rc = ata_host_register(host, sht);
if (rc)
goto out_free_all_irqs;
return 0;
out_free_all_irqs:
- i = host->n_ports;
+ i = n_irqs;
out_free_irqs:
for (i--; i >= 0; i--)
devm_free_irq(host->dev, irq + i, host->ports[i]);
+out:
return rc;
}
+EXPORT_SYMBOL_GPL(ahci_host_activate);
static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
@@ -1265,8 +1281,6 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
n_msis = ahci_init_interrupts(pdev, hpriv);
- if (n_msis > 1)
- hpriv->flags |= AHCI_HFLAG_MULTI_MSI;
/* save initial config */
ahci_pci_save_initial_config(pdev, hpriv);
@@ -1364,11 +1378,7 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
pci_set_master(pdev);
- if (hpriv->flags & AHCI_HFLAG_MULTI_MSI)
- return ahci_host_activate(host, pdev->irq, n_msis);
-
- return ata_host_activate(host, pdev->irq, ahci_interrupt, IRQF_SHARED,
- &ahci_sht);
+ return ahci_host_activate(host, pdev->irq, n_msis, &ahci_sht);
}
module_pci_driver(ahci_pci_driver);
diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h
index b830e6c..ed1fbc8 100644
--- a/drivers/ata/ahci.h
+++ b/drivers/ata/ahci.h
@@ -231,7 +231,6 @@ enum {
AHCI_HFLAG_DELAY_ENGINE = (1 << 15), /* do not start engine on
port start (wait until
error-handling stage) */
- AHCI_HFLAG_MULTI_MSI = (1 << 16), /* multiple PCI MSIs */
/* ap->flags bits */
@@ -361,11 +360,14 @@ int ahci_port_resume(struct ata_port *ap);
void ahci_set_em_messages(struct ahci_host_priv *hpriv,
struct ata_port_info *pi);
int ahci_reset_em(struct ata_host *host);
-irqreturn_t ahci_interrupt(int irq, void *dev_instance);
-irqreturn_t ahci_hw_interrupt(int irq, void *dev_instance);
+irqreturn_t ahci_single_irq_intr(int irq, void *dev_instance);
+irqreturn_t ahci_multi_irqs_intr(int irq, void *dev_instance);
irqreturn_t ahci_thread_fn(int irq, void *dev_instance);
+irqreturn_t ahci_port_thread_fn(int irq, void *dev_instance);
void ahci_print_info(struct ata_host *host, const char *scc_s);
-int ahci_host_activate(struct ata_host *host, int irq, unsigned int n_msis);
+int ahci_host_activate(struct ata_host *host, int irq, unsigned int n_msis,
+ struct scsi_host_template *sht);
+int ahci_init_interrupts(struct pci_dev *pdev, struct ahci_host_priv *hpriv);
static inline void __iomem *__ahci_port_base(struct ata_host *host,
unsigned int port_no)
diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c
index 7a8a284..b10cd87 100644
--- a/drivers/ata/ahci_platform.c
+++ b/drivers/ata/ahci_platform.c
@@ -211,8 +211,7 @@ static int ahci_probe(struct platform_device *pdev)
ahci_init_controller(host);
ahci_print_info(host, "platform");
- rc = ata_host_activate(host, irq, ahci_interrupt, IRQF_SHARED,
- &ahci_platform_sht);
+ rc = ahci_host_activate(host, irq, 0, &ahci_platform_sht);
if (rc)
goto pdata_exit;
diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index 34c8216..68b3bdd 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -1655,9 +1655,9 @@ static void ahci_error_intr(struct ata_port *ap, u32 irq_stat)
ata_port_abort(ap);
}
-static void ahci_handle_port_interrupt(struct ata_port *ap,
- void __iomem *port_mmio, u32 status)
+static void ahci_handle_port_interrupt(struct ata_port *ap, u32 status)
{
+ void __iomem *port_mmio = ahci_port_base(ap);
struct ata_eh_info *ehi = &ap->link.eh_info;
struct ahci_port_priv *pp = ap->private_data;
struct ahci_host_priv *hpriv = ap->host->private_data;
@@ -1740,22 +1740,10 @@ static void ahci_handle_port_interrupt(struct ata_port *ap,
}
}
-void ahci_port_intr(struct ata_port *ap)
-{
- void __iomem *port_mmio = ahci_port_base(ap);
- u32 status;
-
- status = readl(port_mmio + PORT_IRQ_STAT);
- writel(status, port_mmio + PORT_IRQ_STAT);
-
- ahci_handle_port_interrupt(ap, port_mmio, status);
-}
-
-irqreturn_t ahci_thread_fn(int irq, void *dev_instance)
+irqreturn_t ahci_port_thread_fn(int irq, void *dev_instance)
{
struct ata_port *ap = dev_instance;
struct ahci_port_priv *pp = ap->private_data;
- void __iomem *port_mmio = ahci_port_base(ap);
unsigned long flags;
u32 status;
@@ -1766,14 +1754,43 @@ irqreturn_t ahci_thread_fn(int irq, void *dev_instance)
spin_unlock_irqrestore(&ap->host->lock, flags);
spin_lock_bh(ap->lock);
- ahci_handle_port_interrupt(ap, port_mmio, status);
+ ahci_handle_port_interrupt(ap, status);
spin_unlock_bh(ap->lock);
return IRQ_HANDLED;
}
+EXPORT_SYMBOL_GPL(ahci_port_thread_fn);
+
+irqreturn_t ahci_thread_fn(int irq, void *dev_instance)
+{
+ struct ata_host *host = dev_instance;
+ struct ahci_host_priv *hpriv = host->private_data;
+ u32 irq_masked = hpriv->port_map;
+ unsigned int i;
+
+ for (i = 0; i < host->n_ports; i++) {
+ struct ata_port *ap;
+
+ if (!(irq_masked & (1 << i)))
+ continue;
+
+ ap = host->ports[i];
+ if (ap) {
+ ahci_port_thread_fn(irq, ap);
+ VPRINTK("port %u\n", i);
+ } else {
+ VPRINTK("port %u (no irq)\n", i);
+ if (ata_ratelimit())
+ dev_warn(host->dev,
+ "interrupt on disabled port %u\n", i);
+ }
+ }
+
+ return IRQ_HANDLED;
+}
EXPORT_SYMBOL_GPL(ahci_thread_fn);
-void ahci_hw_port_interrupt(struct ata_port *ap)
+void ahci_update_intr_status(struct ata_port *ap)
{
void __iomem *port_mmio = ahci_port_base(ap);
struct ahci_port_priv *pp = ap->private_data;
@@ -1785,7 +1802,7 @@ void ahci_hw_port_interrupt(struct ata_port *ap)
pp->intr_status |= status;
}
-irqreturn_t ahci_hw_interrupt(int irq, void *dev_instance)
+irqreturn_t ahci_multi_irqs_intr(int irq, void *dev_instance)
{
struct ata_port *ap_this = dev_instance;
struct ahci_port_priv *pp = ap_this->private_data;
@@ -1821,7 +1838,7 @@ irqreturn_t ahci_hw_interrupt(int irq, void *dev_instance)
ap = host->ports[i];
if (ap) {
- ahci_hw_port_interrupt(ap);
+ ahci_update_intr_status(ap);
VPRINTK("port %u\n", i);
} else {
VPRINTK("port %u (no irq)\n", i);
@@ -1839,9 +1856,9 @@ irqreturn_t ahci_hw_interrupt(int irq, void *dev_instance)
return IRQ_WAKE_THREAD;
}
-EXPORT_SYMBOL_GPL(ahci_hw_interrupt);
+EXPORT_SYMBOL_GPL(ahci_multi_irqs_intr);
-irqreturn_t ahci_interrupt(int irq, void *dev_instance)
+irqreturn_t ahci_single_irq_intr(int irq, void *dev_instance)
{
struct ata_host *host = dev_instance;
struct ahci_host_priv *hpriv;
@@ -1871,7 +1888,7 @@ irqreturn_t ahci_interrupt(int irq, void *dev_instance)
ap = host->ports[i];
if (ap) {
- ahci_port_intr(ap);
+ ahci_update_intr_status(ap);
VPRINTK("port %u\n", i);
} else {
VPRINTK("port %u (no irq)\n", i);
@@ -1898,9 +1915,9 @@ irqreturn_t ahci_interrupt(int irq, void *dev_instance)
VPRINTK("EXIT\n");
- return IRQ_RETVAL(handled);
+ return handled ? IRQ_WAKE_THREAD : IRQ_NONE;
}
-EXPORT_SYMBOL_GPL(ahci_interrupt);
+EXPORT_SYMBOL_GPL(ahci_single_irq_intr);
static unsigned int ahci_qc_issue(struct ata_queued_cmd *qc)
{
@@ -2294,13 +2311,8 @@ static int ahci_port_start(struct ata_port *ap)
*/
pp->intr_mask = DEF_PORT_IRQ;
- /*
- * Switch to per-port locking in case each port has its own MSI vector.
- */
- if ((hpriv->flags & AHCI_HFLAG_MULTI_MSI)) {
- spin_lock_init(&pp->lock);
- ap->lock = &pp->lock;
- }
+ spin_lock_init(&pp->lock);
+ ap->lock = &pp->lock;
ap->private_data = pp;
--
1.7.7.6
--
Regards,
Alexander Gordeev
[email protected]
Hello, Alexander.
(cc'ing Jens and Nicholas, hey guys)
On Tue, May 21, 2013 at 09:00:27PM +0200, Alexander Gordeev wrote:
> Before this update host lock average holdtime was 3.266532061 and
> average waittime was 0.009832679 [1]. After the update average
> holdtime (slightly) rose up to 0.335267418 while average waittime
> decreased to 0.000320469 [2]. Which means host lock with local
> interrupt disabled is held roughly the same while the average
> waittime dropped 30 times.
>
> After this update port events are handled with local interrupts
> enabled and compete on individual per-port locks with average
> holdtime 1.540987475 and average waittime 0.000714864 [3].
> Comparing to [1], ata_scsi_queuecmd() holds port locks 2 times
> less and waits for locks 13 times less.
Hmmmmmm..... I'd normally apply this patch but block layer is just
growing multi-queue support and libata is likely to be converted to mq
in foreseeable future, so I'm a bit hesitant to make irq handling more
sophiscated right now. Would you be interested in looking into
converting libata to blk mq support? I'm pretty sure it'd yield far
better outcome if done properly.
Thanks.
--
tejun
On Wed, May 22, 2013 at 08:50:03AM +0900, Tejun Heo wrote:
> Hmmmmmm..... I'd normally apply this patch but block layer is just
> growing multi-queue support and libata is likely to be converted to mq
> in foreseeable future, so I'm a bit hesitant to make irq handling more
> sophiscated right now. Would you be interested in looking into
> converting libata to blk mq support? I'm pretty sure it'd yield far
> better outcome if done properly.
I am not committing, but will look into it, sure.
> Thanks.
>
> --
> tejun
--
Regards,
Alexander Gordeev
[email protected]
On Wed, May 22 2013, Alexander Gordeev wrote:
> On Wed, May 22, 2013 at 08:50:03AM +0900, Tejun Heo wrote:
> > Hmmmmmm..... I'd normally apply this patch but block layer is just
> > growing multi-queue support and libata is likely to be converted to mq
> > in foreseeable future, so I'm a bit hesitant to make irq handling more
> > sophiscated right now. Would you be interested in looking into
> > converting libata to blk mq support? I'm pretty sure it'd yield far
> > better outcome if done properly.
>
> I am not committing, but will look into it, sure.
Would be most awesome, I'm sure Nic would not mind a bit of help on the
SCSI/libata side :-)
And personally, can't wait to run it on the laptop! That's right, I
alpha test on the laptop.
--
Jens Axboe
On Wed, May 22, 2013 at 07:03:05PM +0200, Jens Axboe wrote:
> On Wed, May 22 2013, Alexander Gordeev wrote:
> > On Wed, May 22, 2013 at 08:50:03AM +0900, Tejun Heo wrote:
> > > Hmmmmmm..... I'd normally apply this patch but block layer is just
> > > growing multi-queue support and libata is likely to be converted to mq
> > > in foreseeable future, so I'm a bit hesitant to make irq handling more
> > > sophiscated right now. Would you be interested in looking into
> > > converting libata to blk mq support? I'm pretty sure it'd yield far
> > > better outcome if done properly.
> >
> > I am not committing, but will look into it, sure.
>
> Would be most awesome, I'm sure Nic would not mind a bit of help on the
> SCSI/libata side :-)
Hi Nicholas,
Could you please clarify the status of SCSI MQ support? Is it usable now?
I tried git://git.kernel.org/pub/scm/linux/kernel/git/nab/target-pending.git,
but it does not appear working without (at least) changes below to SCSI lib:
Thanks!
diff --git a/drivers/scsi/scsi-mq.c b/drivers/scsi/scsi-mq.c
index ca6ff67..d8cc7a4 100644
--- a/drivers/scsi/scsi-mq.c
+++ b/drivers/scsi/scsi-mq.c
@@ -155,6 +155,7 @@ void scsi_mq_done(struct scsi_cmnd *sc)
static struct blk_mq_ops scsi_mq_ops = {
.queue_rq = scsi_mq_queue_rq,
.map_queue = blk_mq_map_queue,
+ .timeout = scsi_times_out,
.alloc_hctx = blk_mq_alloc_single_hw_queue,
.free_hctx = blk_mq_free_single_hw_queue,
};
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 65360db..33aa373 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -283,7 +283,10 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
/*
* head injection *required* here otherwise quiesce won't work
*/
- blk_execute_rq(req->q, NULL, req, 1);
+ if (q->mq_ops)
+ blk_mq_execute_rq(req->q, req);
+ else
+ blk_execute_rq(req->q, NULL, req, 1);
/*
* Some devices (USB mass-storage in particular) may transfer
@@ -298,12 +301,8 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
*resid = req->resid_len;
ret = req->errors;
out:
- if (q->mq_ops) {
- printk("scsi_execute(): Calling blk_mq_free_request >>>\n");
- blk_mq_free_request(req);
- } else {
+ if (!q->mq_ops)
blk_put_request(req);
- }
return ret;
}
> And personally, can't wait to run it on the laptop! That's right, I
> alpha test on the laptop.
>
> --
> Jens Axboe
>
--
Regards,
Alexander Gordeev
[email protected]
On Thu, 2013-07-11 at 12:26 +0200, Alexander Gordeev wrote:
> On Wed, May 22, 2013 at 07:03:05PM +0200, Jens Axboe wrote:
> > On Wed, May 22 2013, Alexander Gordeev wrote:
> > > On Wed, May 22, 2013 at 08:50:03AM +0900, Tejun Heo wrote:
> > > > Hmmmmmm..... I'd normally apply this patch but block layer is just
> > > > growing multi-queue support and libata is likely to be converted to mq
> > > > in foreseeable future, so I'm a bit hesitant to make irq handling more
> > > > sophiscated right now. Would you be interested in looking into
> > > > converting libata to blk mq support? I'm pretty sure it'd yield far
> > > > better outcome if done properly.
> > >
> > > I am not committing, but will look into it, sure.
> >
> > Would be most awesome, I'm sure Nic would not mind a bit of help on the
> > SCSI/libata side :-)
>
> Hi Nicholas,
>
> Could you please clarify the status of SCSI MQ support? Is it usable now?
>
Hi Alexander,
Thanks for taking a look. I've not made further progress in the last
weeks on scsi-mq, but am still using virtio-scsi + scsi-mq <->
vhost-scsi + per-cpu-ida for quite a bit for benchmarking purposes.
> I tried git://git.kernel.org/pub/scm/linux/kernel/git/nab/target-pending.git,
> but it does not appear working without (at least) changes below to SCSI lib:
>
The only scsi-mq LLD conversions so far have been to virtio-scsi +
scsi_debug to nop REQ_TYPE_FS.
Just so I understand, your patch below is required in order to make what
LLD function with scsi-mq..?
Thanks!
--nab
> Thanks!
>
> diff --git a/drivers/scsi/scsi-mq.c b/drivers/scsi/scsi-mq.c
> index ca6ff67..d8cc7a4 100644
> --- a/drivers/scsi/scsi-mq.c
> +++ b/drivers/scsi/scsi-mq.c
> @@ -155,6 +155,7 @@ void scsi_mq_done(struct scsi_cmnd *sc)
> static struct blk_mq_ops scsi_mq_ops = {
> .queue_rq = scsi_mq_queue_rq,
> .map_queue = blk_mq_map_queue,
> + .timeout = scsi_times_out,
> .alloc_hctx = blk_mq_alloc_single_hw_queue,
> .free_hctx = blk_mq_free_single_hw_queue,
> };
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 65360db..33aa373 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -283,7 +283,10 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
> /*
> * head injection *required* here otherwise quiesce won't work
> */
> - blk_execute_rq(req->q, NULL, req, 1);
> + if (q->mq_ops)
> + blk_mq_execute_rq(req->q, req);
> + else
> + blk_execute_rq(req->q, NULL, req, 1);
>
> /*
> * Some devices (USB mass-storage in particular) may transfer
> @@ -298,12 +301,8 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
> *resid = req->resid_len;
> ret = req->errors;
> out:
> - if (q->mq_ops) {
> - printk("scsi_execute(): Calling blk_mq_free_request >>>\n");
> - blk_mq_free_request(req);
> - } else {
> + if (!q->mq_ops)
> blk_put_request(req);
> - }
>
> return ret;
> }
>
>
> > And personally, can't wait to run it on the laptop! That's right, I
> > alpha test on the laptop.
> >
> > --
> > Jens Axboe
> >
>
On Thu, Jul 11, 2013 at 04:00:37PM -0700, Nicholas A. Bellinger wrote:
> On Thu, 2013-07-11 at 12:26 +0200, Alexander Gordeev wrote:
> > On Wed, May 22, 2013 at 07:03:05PM +0200, Jens Axboe wrote:
> > > On Wed, May 22 2013, Alexander Gordeev wrote:
> > > > On Wed, May 22, 2013 at 08:50:03AM +0900, Tejun Heo wrote:
> > > > > Hmmmmmm..... I'd normally apply this patch but block layer is just
> > > > > growing multi-queue support and libata is likely to be converted to mq
> > > > > in foreseeable future, so I'm a bit hesitant to make irq handling more
> > > > > sophiscated right now. Would you be interested in looking into
> > > > > converting libata to blk mq support? I'm pretty sure it'd yield far
> > > > > better outcome if done properly.
> > > >
> > > > I am not committing, but will look into it, sure.
> > >
> > > Would be most awesome, I'm sure Nic would not mind a bit of help on the
> > > SCSI/libata side :-)
> >
> > Hi Nicholas,
> >
> > Could you please clarify the status of SCSI MQ support? Is it usable now?
> >
>
> Hi Alexander,
>
> Thanks for taking a look. I've not made further progress in the last
> weeks on scsi-mq, but am still using virtio-scsi + scsi-mq <->
> vhost-scsi + per-cpu-ida for quite a bit for benchmarking purposes.
Thanks for the clarification, Nicholas.
> > I tried git://git.kernel.org/pub/scm/linux/kernel/git/nab/target-pending.git,
> > but it does not appear working without (at least) changes below to SCSI lib:
> >
>
> The only scsi-mq LLD conversions so far have been to virtio-scsi +
> scsi_debug to nop REQ_TYPE_FS.
I see. Do you think the changes I made is a right way to go?
I had to make it to avoid a NULL-pointer assignent and make BIO bounce
buffers work, but I do not really understand the mixture of old and
new code ( neither in fact :) )
> Just so I understand, your patch below is required in order to make what
> LLD function with scsi-mq..?
ata_piix
>
> Thanks!
>
> --nab
>
> > Thanks!
> >
> > diff --git a/drivers/scsi/scsi-mq.c b/drivers/scsi/scsi-mq.c
> > index ca6ff67..d8cc7a4 100644
> > --- a/drivers/scsi/scsi-mq.c
> > +++ b/drivers/scsi/scsi-mq.c
> > @@ -155,6 +155,7 @@ void scsi_mq_done(struct scsi_cmnd *sc)
> > static struct blk_mq_ops scsi_mq_ops = {
> > .queue_rq = scsi_mq_queue_rq,
> > .map_queue = blk_mq_map_queue,
> > + .timeout = scsi_times_out,
> > .alloc_hctx = blk_mq_alloc_single_hw_queue,
> > .free_hctx = blk_mq_free_single_hw_queue,
> > };
> > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> > index 65360db..33aa373 100644
> > --- a/drivers/scsi/scsi_lib.c
> > +++ b/drivers/scsi/scsi_lib.c
> > @@ -283,7 +283,10 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
> > /*
> > * head injection *required* here otherwise quiesce won't work
> > */
> > - blk_execute_rq(req->q, NULL, req, 1);
> > + if (q->mq_ops)
> > + blk_mq_execute_rq(req->q, req);
> > + else
> > + blk_execute_rq(req->q, NULL, req, 1);
> >
> > /*
> > * Some devices (USB mass-storage in particular) may transfer
> > @@ -298,12 +301,8 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
> > *resid = req->resid_len;
> > ret = req->errors;
> > out:
> > - if (q->mq_ops) {
> > - printk("scsi_execute(): Calling blk_mq_free_request >>>\n");
> > - blk_mq_free_request(req);
> > - } else {
> > + if (!q->mq_ops)
> > blk_put_request(req);
> > - }
> >
> > return ret;
> > }
> >
> >
> > > And personally, can't wait to run it on the laptop! That's right, I
> > > alpha test on the laptop.
> > >
> > > --
> > > Jens Axboe
> > >
> >
>
>
--
Regards,
Alexander Gordeev
[email protected]
On Fri, 2013-07-12 at 09:46 +0200, Alexander Gordeev wrote:
> On Thu, Jul 11, 2013 at 04:00:37PM -0700, Nicholas A. Bellinger wrote:
> > On Thu, 2013-07-11 at 12:26 +0200, Alexander Gordeev wrote:
> > > On Wed, May 22, 2013 at 07:03:05PM +0200, Jens Axboe wrote:
> > > > On Wed, May 22 2013, Alexander Gordeev wrote:
> > > > > On Wed, May 22, 2013 at 08:50:03AM +0900, Tejun Heo wrote:
> > > > > > Hmmmmmm..... I'd normally apply this patch but block layer is just
> > > > > > growing multi-queue support and libata is likely to be converted to mq
> > > > > > in foreseeable future, so I'm a bit hesitant to make irq handling more
> > > > > > sophiscated right now. Would you be interested in looking into
> > > > > > converting libata to blk mq support? I'm pretty sure it'd yield far
> > > > > > better outcome if done properly.
> > > > >
> > > > > I am not committing, but will look into it, sure.
> > > >
> > > > Would be most awesome, I'm sure Nic would not mind a bit of help on the
> > > > SCSI/libata side :-)
> > >
> > > Hi Nicholas,
> > >
> > > Could you please clarify the status of SCSI MQ support? Is it usable now?
> > >
> >
> > Hi Alexander,
> >
> > Thanks for taking a look. I've not made further progress in the last
> > weeks on scsi-mq, but am still using virtio-scsi + scsi-mq <->
> > vhost-scsi + per-cpu-ida for quite a bit for benchmarking purposes.
>
> Thanks for the clarification, Nicholas.
>
> > > I tried git://git.kernel.org/pub/scm/linux/kernel/git/nab/target-pending.git,
> > > but it does not appear working without (at least) changes below to SCSI lib:
> > >
> >
> > The only scsi-mq LLD conversions so far have been to virtio-scsi +
> > scsi_debug to nop REQ_TYPE_FS.
>
> I see. Do you think the changes I made is a right way to go?
>
> I had to make it to avoid a NULL-pointer assignent and make BIO bounce
> buffers work, but I do not really understand the mixture of old and
> new code ( neither in fact :) )
>
Hi Alexander,
Comments below.
> > Just so I understand, your patch below is required in order to make what
> > LLD function with scsi-mq..?
>
> ata_piix
>
<nod>
> >
> > Thanks!
> >
> > --nab
> >
> > > Thanks!
> > >
> > > diff --git a/drivers/scsi/scsi-mq.c b/drivers/scsi/scsi-mq.c
> > > index ca6ff67..d8cc7a4 100644
> > > --- a/drivers/scsi/scsi-mq.c
> > > +++ b/drivers/scsi/scsi-mq.c
> > > @@ -155,6 +155,7 @@ void scsi_mq_done(struct scsi_cmnd *sc)
> > > static struct blk_mq_ops scsi_mq_ops = {
> > > .queue_rq = scsi_mq_queue_rq,
> > > .map_queue = blk_mq_map_queue,
> > > + .timeout = scsi_times_out,
> > > .alloc_hctx = blk_mq_alloc_single_hw_queue,
> > > .free_hctx = blk_mq_free_single_hw_queue,
> > > };
So your actually triggering a blk-mq timeout with ata_piix..?
> > > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> > > index 65360db..33aa373 100644
> > > --- a/drivers/scsi/scsi_lib.c
> > > +++ b/drivers/scsi/scsi_lib.c
> > > @@ -283,7 +283,10 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
> > > /*
> > > * head injection *required* here otherwise quiesce won't work
> > > */
> > > - blk_execute_rq(req->q, NULL, req, 1);
> > > + if (q->mq_ops)
> > > + blk_mq_execute_rq(req->q, req);
> > > + else
> > > + blk_execute_rq(req->q, NULL, req, 1);
> > >
The scsi_execute() -> REQ_TYPE_BLOCK_RQ special case (scsi_scan +
scsi_ioctl) has a small issue preventing it's conversion to use
blk_mq_execute_rq().
Namely that scsi_execute_cmd() currently expects to be able to check for
the existence of req->errors + req->resid_len *after*
blk_mq_execute_rq() -> blk_mq_finish_request() -> blk_mq_free_request()
has been called to mark the pre-allocated struct request as free for
blk-mq re-use.
That is why scsi-mq still uses blk_execute_rq() for this reason, and
this will need be addressed in order to safely use blk_mq_execute_rq()
in the above context.
> > > /*
> > > * Some devices (USB mass-storage in particular) may transfer
> > > @@ -298,12 +301,8 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
> > > *resid = req->resid_len;
> > > ret = req->errors;
> > > out:
> > > - if (q->mq_ops) {
> > > - printk("scsi_execute(): Calling blk_mq_free_request >>>\n");
> > > - blk_mq_free_request(req);
> > > - } else {
> > > + if (!q->mq_ops)
> > > blk_put_request(req);
> > > - }
> > >
Do you have an OOPs backtrace handy to post w/o the last two changes in
place..?
Also, just a heads up that so far I've been using IDE (/dev/hdX) for the
root-device in order to make scsi-mq debugging safer and easier.
I *very* much recommend doing the same if at all possible for ata_piix
scsi-mq development + testing, as you'll want to be very careful when
using a real file-system on top of this early alpha code.
--nab
On Fri, Jul 12, 2013 at 10:20:12PM -0700, Nicholas A. Bellinger wrote:
> On Fri, 2013-07-12 at 09:46 +0200, Alexander Gordeev wrote:
> > > > diff --git a/drivers/scsi/scsi-mq.c b/drivers/scsi/scsi-mq.c
> > > > index ca6ff67..d8cc7a4 100644
> > > > --- a/drivers/scsi/scsi-mq.c
> > > > +++ b/drivers/scsi/scsi-mq.c
> > > > @@ -155,6 +155,7 @@ void scsi_mq_done(struct scsi_cmnd *sc)
> > > > static struct blk_mq_ops scsi_mq_ops = {
> > > > .queue_rq = scsi_mq_queue_rq,
> > > > .map_queue = blk_mq_map_queue,
> > > > + .timeout = scsi_times_out,
> > > > .alloc_hctx = blk_mq_alloc_single_hw_queue,
> > > > .free_hctx = blk_mq_free_single_hw_queue,
> > > > };
>
> So your actually triggering a blk-mq timeout with ata_piix..?
No.
That is to avoid a NULL-pointer assignment from ->timeout elsewhere.
In fact I return -ENODEV for sr_probe() to not hit it.
> That is why scsi-mq still uses blk_execute_rq() for this reason, and
> this will need be addressed in order to safely use blk_mq_execute_rq()
> in the above context.
Got it.
> Do you have an OOPs backtrace handy to post w/o the last two changes in
> place..?
Attaching the output. No oops actually (due to aforementioned .timeout).
> I *very* much recommend doing the same if at all possible for ata_piix
> scsi-mq development + testing, as you'll want to be very careful when
> using a real file-system on top of this early alpha code.
Thank you for the warning.
Getting to writing CDBs would be of success :)
> --nab
>
--
Regards,
Alexander Gordeev
[email protected]
On Tue, 2013-07-16 at 20:32 +0200, Alexander Gordeev wrote:
> On Fri, Jul 12, 2013 at 10:20:12PM -0700, Nicholas A. Bellinger wrote:
> > On Fri, 2013-07-12 at 09:46 +0200, Alexander Gordeev wrote:
> > > > > diff --git a/drivers/scsi/scsi-mq.c b/drivers/scsi/scsi-mq.c
> > > > > index ca6ff67..d8cc7a4 100644
> > > > > --- a/drivers/scsi/scsi-mq.c
> > > > > +++ b/drivers/scsi/scsi-mq.c
> > > > > @@ -155,6 +155,7 @@ void scsi_mq_done(struct scsi_cmnd *sc)
> > > > > static struct blk_mq_ops scsi_mq_ops = {
> > > > > .queue_rq = scsi_mq_queue_rq,
> > > > > .map_queue = blk_mq_map_queue,
> > > > > + .timeout = scsi_times_out,
> > > > > .alloc_hctx = blk_mq_alloc_single_hw_queue,
> > > > > .free_hctx = blk_mq_free_single_hw_queue,
> > > > > };
> >
> > So your actually triggering a blk-mq timeout with ata_piix..?
>
> No.
> That is to avoid a NULL-pointer assignment from ->timeout elsewhere.
> In fact I return -ENODEV for sr_probe() to not hit it.
>
> > That is why scsi-mq still uses blk_execute_rq() for this reason, and
> > this will need be addressed in order to safely use blk_mq_execute_rq()
> > in the above context.
>
> Got it.
>
> > Do you have an OOPs backtrace handy to post w/o the last two changes in
> > place..?
>
> Attaching the output. No oops actually (due to aforementioned .timeout).
>
Hi Alexander,
Thanks for the logs. I'm In-lining some of the output here for
reference:
[ 7.927596] Calling blk_mq_init_queue: scsi_mq_ops: ffffffff81ca13e0, queue_depth: 64, cmd_size: 296 SCSI cmd_size: 0
Just FYI, a SCSI cmd_size of zero here means that scsi-mq will not be
providing pre-allocated LLD descriptors (located at scsi_cmnd->SCp.ptr)
for use by libata driver code.
That is fine for initial testing, but libata will eventually want to
take advantage of scsi_host_template->cmd_size = sizeof(ata_queued_cmd)
in order to remove (all) memory allocations from the I/O fast-path.
[ 7.927639] blk-mq: CPU -> queue map
[ 7.927640] CPU 0 -> Queue 0
[ 7.927640] CPU 1 -> Queue 0
[ 7.927640] CPU 2 -> Queue 0
[ 7.927641] CPU 3 -> Queue 0
[ 7.927641] CPU 4 -> Queue 0
[ 7.927642] CPU 5 -> Queue 0
[ 7.927642] CPU 6 -> Queue 0
[ 7.927643] CPU 7 -> Queue 0
[ 7.927643] CPU 8 -> Queue 0
[ 7.927643] CPU 9 -> Queue 0
[ 7.927644] CPU10 -> Queue 0
[ 7.927644] CPU11 -> Queue 0
[ 7.927645] CPU12 -> Queue 0
[ 7.927645] CPU13 -> Queue 0
[ 7.927646] CPU14 -> Queue 0
[ 7.927646] CPU15 -> Queue 0
[ 7.927673] Performing sc map setup on q: ffff880462430000 hctx: ffff880462a14200 i: 0
[ 7.927780] scsi_mq_alloc_queue() complete !! >>>>>>>>>>>>>>>>>>>>>>>>>>>
[ 7.927784] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.927785] Allocated blk-mq req: ffff88046244ad40, req->tag: 63
[ 7.927790] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.927803] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.927815] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.927816] Allocated blk-mq req: ffff88046244ad40, req->tag: 63
[ 7.927817] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.927818] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.927826] scsi 0:0:0:0: Direct-Access ATA ST9500530NS CC03 PQ: 0 ANSI: 5
OK, so INQUIRY response payload is looking as expected here.
[ 7.927944] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.927946] Allocated blk-mq req: ffff88046244a7c0, req->tag: 61
[ 7.927946] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.927949] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.927951] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.927952] Allocated blk-mq req: ffff88046244a7c0, req->tag: 61
[ 7.927955] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.927958] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.927960] sd 0:0:0:0: [sda] Sector size 0 reported, assuming 512.
[ 7.927964] sd 0:0:0:0: [sda] 1 512-byte logical blocks: (512 B/512 B)
[ 7.927965] sd 0:0:0:0: [sda] 0-byte physical blocks
Strange.. READ_CAPACITY appears to be returning a payload as zeros..?
[ 7.927966] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.927967] Allocated blk-mq req: ffff88046244a7c0, req->tag: 61
[ 7.927968] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.927970] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.927970] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.927971] Allocated blk-mq req: ffff88046244a7c0, req->tag: 61
[ 7.927972] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.927973] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.927975] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.927976] Allocated blk-mq req: ffff88046244a7c0, req->tag: 61
[ 7.927977] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.927979] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.927981] sd 0:0:0:0: [sda] Write Protect is off
[ 7.927982] sd 0:0:0:0: [sda] Mode Sense: 00 00 00 00
Ditto here..
[ 7.927983] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.927984] Allocated blk-mq req: ffff88046244a7c0, req->tag: 61
[ 7.927985] sd 0:0:0:0: Attached scsi generic sg0 type 0
[ 7.927986] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.927987] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.927989] sd 0:0:0:0: [sda] Asking for cache data failed
[ 7.927990] sd 0:0:0:0: [sda] Assuming drive cache: write through
and here as well..
Not sure why yet some control CDBs are getting back the expected
payload, while others are returning zeros..
Also, looking at the included stack back-trace:
[ 7.928394] blk-mq: CPU -> queue map
[ 7.928394] CPU 0 -> Queue 0
[ 7.928395] CPU 1 -> Queue 0
[ 7.928395] CPU 2 -> Queue 0
[ 7.928396] CPU 3 -> Queue 0
[ 7.928396] CPU 4 -> Queue 0
[ 7.928396] CPU 5 -> Queue 0
[ 7.928397] CPU 6 -> Queue 0
[ 7.928397] CPU 7 -> Queue 0
[ 7.928398] CPU 8 -> Queue 0
[ 7.928398] CPU 9 -> Queue 0
[ 7.928399] CPU10 -> Queue 0
[ 7.928399] CPU11 -> Queue 0
[ 7.928399] CPU12 -> Queue 0
[ 7.928400] CPU13 -> Queue 0
[ 7.928400] CPU14 -> Queue 0
[ 7.928401] CPU15 -> Queue 0
[ 7.928420] Performing sc map setup on q: ffff8804624f08e0 hctx: ffff880462938a00 i: 0
[ 7.928435] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.928436] Allocated blk-mq req: ffff88046250a240, req->tag: 59
[ 7.928437] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.928439] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.928441] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.928441] Allocated blk-mq req: ffff88046250a240, req->tag: 59
[ 7.928443] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.928444] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.928446] sd 0:0:1:0: [sdb] Sector size 0 reported, assuming 512.
[ 7.928448] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.928448] Allocated blk-mq req: ffff88046250a240, req->tag: 59
[ 7.928450] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.928451] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.928452] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.928452] Allocated blk-mq req: ffff88046250a240, req->tag: 59
[ 7.928457] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.928458] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.928459] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.928460] Allocated blk-mq req: ffff88046250a240, req->tag: 59
[ 7.928461] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.928462] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.928463] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.928464] Allocated blk-mq req: ffff88046250a240, req->tag: 59
[ 7.928465] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.928466] scsi_execute(): Calling blk_mq_free_request >>>
[ 7.928468] sd 0:0:1:0: [sdb] Asking for cache data failed
[ 7.928469] sd 0:0:1:0: [sdb] Assuming drive cache: write through
[ 7.928487] ------------[ cut here ]------------
[ 7.928492] WARNING: at drivers/ata/libata-core.c:5038 ata_qc_issue+0x266/0x380()
Here is the code in question:
void ata_qc_issue(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct ata_link *link = qc->dev->link;
u8 prot = qc->tf.protocol;
/* Make sure only one non-NCQ command is outstanding. The
* check is skipped for old EH because it reuses active qc to
* request ATAPI sense.
*/
WARN_ON_ONCE(ap->ops->error_handler && ata_tag_valid(link->active_tag));
....
}
So I think that ata_tag_valid() is triggering this WARN_ON_ONCE due to
the default queue_depth setting in scsi-mq.c:scsi_mq_alloc_queue(),
which is queueing more than a single outstanding struct scsi_cmnd at a
time into the underlying LLD. Note in this value is currently
hard-coded to:
sdev->sdev_mq_reg.queue_depth = 64;
This value should actually be coming from what the hardware is
advertising, eg:
min(scsi_host_template->cmd_per_lun, scsi_host_template->can_queue)
but I'd recommend to try to hardcode this value to 1 for the moment in
order to match what ata_piix is reporting to SCSI.
Also, just to be safe, please also disable scsi-generic
(CONFIG_CHR_DEV_SG) in your kernel config, as it's not hooked up to
scsi-mq just yet, and may be causing problems elsewhere.
Thanks!
--nab
[ 7.928494] Modules linked in:
[ 7.928496] CPU: 9 PID: 153 Comm: kworker/u50:5 Not tainted 3.10.0-rc5.nab+ #11
[ 7.928497] Hardware name: Cisco Systems Inc R210-2121605W/R210-2121605W, BIOS C200.1.4.3j.0.020720132258 02/07/2013
[ 7.928502] Workqueue: events_unbound async_run_entry_fn
[ 7.928505] 0000000000000009 ffff88046241f588 ffffffff8162cc58 ffff88046241f5c8
[ 7.928507] ffffffff8104a200 ffff88046241f5d8 ffff880462b90000 0000000000000003
[ 7.928509] ffff880462b91c68 ffffffff81415450 ffff880462b90230 ffff88046241f5d8
[ 7.928510] Call Trace:
[ 7.928514] [<ffffffff8162cc58>] dump_stack+0x19/0x1b
[ 7.928518] [<ffffffff8104a200>] warn_slowpath_common+0x70/0xa0
[ 7.928521] [<ffffffff81415450>] ? ata_scsi_set_sense.constprop.26+0x30/0x30
[ 7.928523] [<ffffffff8104a24a>] warn_slowpath_null+0x1a/0x20
[ 7.928525] [<ffffffff8140f586>] ata_qc_issue+0x266/0x380
[ 7.928526] [<ffffffff814155b3>] ? ata_scsi_rw_xlat+0x163/0x210
[ 7.928528] [<ffffffff81415450>] ? ata_scsi_set_sense.constprop.26+0x30/0x30
[ 7.928530] [<ffffffff814140d7>] ata_scsi_translate+0xa7/0x180
[ 7.928531] scsi_mq_alloc_queue() complete !! >>>>>>>>>>>>>>>>>>>>>>>>>>>
[ 7.928533] [<ffffffff814181f9>] ata_scsi_queuecmd+0xa9/0x2b0
[ 7.928534] Entering scsi_execute with q->mq_ops: ffffffff81ca13e0
[ 7.928537] [<ffffffff813ed956>] scsi_dispatch_cmd+0x1c6/0x310
[ 7.928537] Allocated blk-mq req: ffff88046259ad40, req->tag: 63
[ 7.928540] [<ffffffff813f63bb>] scsi_mq_queue_rq+0x17b/0x280
[ 7.928541] Calling blk_mq_insert_request from blk_execute_rq_nowait >>>>>>>>>>>>>>>>
[ 7.928546] [<ffffffff812c15c5>] __blk_mq_run_hw_queue+0x1b5/0x3a0
[ 7.928549] [<ffffffff812c1c65>] blk_mq_run_hw_queue+0x35/0x40
[ 7.928550] [<ffffffff812c1fbb>] blk_mq_make_request+0x34b/0x4a0
[ 7.928554] [<ffffffff812b74f2>] generic_make_request+0xc2/0x110
[ 7.928556] [<ffffffff812b7a1b>] submit_bio+0x7b/0x160
[ 7.928560] [<ffffffff811baa8d>] ? bio_alloc_bioset+0x9d/0x1b0
[ 7.928562] [<ffffffff811b576e>] _submit_bh+0x13e/0x200
[ 7.928564] [<ffffffff811b5840>] submit_bh+0x10/0x20
[ 7.928566] [<ffffffff811b754d>] block_read_full_page+0x21d/0x350
[ 7.928568] [<ffffffff811bbff0>] ? I_BDEV+0x10/0x10
[ 7.928571] [<ffffffff8113be73>] ? __inc_zone_page_state+0x33/0x40
[ 7.928573] [<ffffffff8111f4bf>] ? add_to_page_cache_locked+0xdf/0x190
[ 7.928575] [<ffffffff811bc4a0>] ? blkdev_write_begin+0x30/0x30
[ 7.928577] [<ffffffff811bc4b8>] blkdev_readpage+0x18/0x20
[ 7.928579] [<ffffffff8111fcfa>] do_read_cache_page+0x7a/0x170
[ 7.928581] [<ffffffff8112837a>] ? __alloc_pages_nodemask+0x17a/0xad0
[ 7.928583] [<ffffffff8111fe09>] read_cache_page_async+0x19/0x20
[ 7.928585] [<ffffffff8112015e>] read_cache_page+0xe/0x20
[ 7.928588] [<ffffffff812c80cd>] read_dev_sector+0x2d/0x90
[ 7.928590] [<ffffffff812cdc4c>] read_lba+0xec/0x190
[ 7.928592] [<ffffffff812ce255>] ? efi_partition+0xe5/0x5f0
[ 7.928594] [<ffffffff812ce26f>] efi_partition+0xff/0x5f0
[ 7.928596] [<ffffffff812e9c34>] ? snprintf+0x34/0x40
[ 7.928598] [<ffffffff812ce170>] ? is_gpt_valid+0x480/0x480
[ 7.928600] [<ffffffff812c9138>] check_partition+0x108/0x220
[ 7.928602] [<ffffffff812c8d44>] rescan_partitions+0xb4/0x2c0
[ 7.928604] [<ffffffff811bda35>] __blkdev_get+0x375/0x4b0
[ 7.928606] [<ffffffff8119e447>] ? inode_init_always+0x107/0x1c0
[ 7.928608] [<ffffffff811bc010>] ? blkdev_get_block+0x20/0x20
[ 7.928610] [<ffffffff811bdd05>] blkdev_get+0x195/0x2e0
[ 7.928612] [<ffffffff8119f0c7>] ? unlock_new_inode+0x47/0x70
[ 7.928613] [<ffffffff811bcff0>] ? bdget+0x120/0x140
[ 7.928615] [<ffffffff812c6531>] add_disk+0x391/0x490
[ 7.928618] [<ffffffff81402c4a>] sd_probe_async+0x13a/0x230
[ 7.928620] [<ffffffff81075de6>] async_run_entry_fn+0x46/0x140
[ 7.928623] [<ffffffff810683c4>] process_one_work+0x174/0x400
[ 7.928624] [<ffffffff81068acc>] worker_thread+0x11c/0x370
[ 7.928626] [<ffffffff810689b0>] ? rescuer_thread+0x320/0x320
[ 7.928629] [<ffffffff8106f410>] kthread+0xc0/0xd0
[ 7.928631] [<ffffffff8106f350>] ? flush_kthread_worker+0x80/0x80
[ 7.928633] [<ffffffff8163b2dc>] ret_from_fork+0x7c/0xb0
[ 7.928635] [<ffffffff8106f350>] ? flush_kthread_worker+0x80/0x80
[ 7.928638] ---[ end trace 9f4b3fe3fb787a07 ]---
> > I *very* much recommend doing the same if at all possible for ata_piix
> > scsi-mq development + testing, as you'll want to be very careful when
> > using a real file-system on top of this early alpha code.
>
> Thank you for the warning.
> Getting to writing CDBs would be of success :)
>
> > --nab
> >
>
On Tue, Jul 16, 2013 at 02:38:03PM -0700, Nicholas A. Bellinger wrote:
> [ 7.927818] scsi_execute(): Calling blk_mq_free_request >>>
> [ 7.927826] scsi 0:0:0:0: Direct-Access ATA ST9500530NS CC03 PQ: 0 ANSI: 5
>
> OK, so INQUIRY response payload is looking as expected here.
Yep. It is not on the top of my head, but I remember something like INQUIRYs
are emulated and thus do not have payload.
> [ 7.927960] sd 0:0:0:0: [sda] Sector size 0 reported, assuming 512.
> [ 7.927964] sd 0:0:0:0: [sda] 1 512-byte logical blocks: (512 B/512 B)
> [ 7.927965] sd 0:0:0:0: [sda] 0-byte physical blocks
>
> Strange.. READ_CAPACITY appears to be returning a payload as zeros..?
Yep. Because blk_execute_rq() does not put the proper callback and data do
not get copied from sg's to bounce buffer. That is why I tried to use
blk_mq_execute_rq() instead. Once I do that, data start getting read and
booting stops elsewhere.
Of course, I was suspecting that change alone is not valid and wondered
about the status of scsi-mq in the first place, and if more changes are
coming.
So I it turns out "req->errors + req->resid_len" issue (you described
earlier) needs to be addressed before going forward with libata (only?).
> Not sure why yet some control CDBs are getting back the expected
> payload, while others are returning zeros..
Bio buffers do not get updated from callback.
> Also, looking at the included stack back-trace:
[...]
Thanks a lot for these and other your comments, Nicholas!
--
Regards,
Alexander Gordeev
[email protected]
On Wed, 2013-07-17 at 18:19 +0200, Alexander Gordeev wrote:
> On Tue, Jul 16, 2013 at 02:38:03PM -0700, Nicholas A. Bellinger wrote:
> > [ 7.927818] scsi_execute(): Calling blk_mq_free_request >>>
> > [ 7.927826] scsi 0:0:0:0: Direct-Access ATA ST9500530NS CC03 PQ: 0 ANSI: 5
> >
> > OK, so INQUIRY response payload is looking as expected here.
>
> Yep. It is not on the top of my head, but I remember something like INQUIRYs
> are emulated and thus do not have payload.
>
> > [ 7.927960] sd 0:0:0:0: [sda] Sector size 0 reported, assuming 512.
> > [ 7.927964] sd 0:0:0:0: [sda] 1 512-byte logical blocks: (512 B/512 B)
> > [ 7.927965] sd 0:0:0:0: [sda] 0-byte physical blocks
> >
> > Strange.. READ_CAPACITY appears to be returning a payload as zeros..?
>
> Yep. Because blk_execute_rq() does not put the proper callback and data do
> not get copied from sg's to bounce buffer. That is why I tried to use
> blk_mq_execute_rq() instead. Once I do that, data start getting read and
> booting stops elsewhere.
Mmmmmm.
The call to blk_queue_bounce() exists within blk_mq_make_request(), but
AFAICT this should still be getting invoked regardless of if the struct
request is dispatched into blk-mq via the modified blk_execute_rq() ->
blk_execute_rq_nowait() -> blk_mq_insert_request() codepath, or directly
via blk_mq_execute_rq()..
Jens..?
>
> Of course, I was suspecting that change alone is not valid and wondered
> about the status of scsi-mq in the first place, and if more changes are
> coming.
Most certainly. ;)
>
> So I it turns out "req->errors + req->resid_len" issue (you described
> earlier) needs to be addressed before going forward with libata (only?).
AFAICT, getting an initial conversion of libata up does not depend upon
this specific issue being addressed first. I could be wrong however..
>
> > Not sure why yet some control CDBs are getting back the expected
> > payload, while others are returning zeros..
>
> Bio buffers do not get updated from callback.
>
> > Also, looking at the included stack back-trace:
>
> [...]
>
> Thanks a lot for these and other your comments, Nicholas!
>
Sure. I should have a few extra cycles to hack on this over the
weekend.
Also, thinking about this some more, trying to convert ahci to scsi-mq
first (using QEMU emulation), while keeping a rootfs on PIIX_IDE might
make debugging slightly easier..
--nab
On Thu, 2013-07-18 at 11:51 -0700, Nicholas A. Bellinger wrote:
> On Wed, 2013-07-17 at 18:19 +0200, Alexander Gordeev wrote:
> > On Tue, Jul 16, 2013 at 02:38:03PM -0700, Nicholas A. Bellinger wrote:
> > > [ 7.927818] scsi_execute(): Calling blk_mq_free_request >>>
> > > [ 7.927826] scsi 0:0:0:0: Direct-Access ATA ST9500530NS CC03 PQ: 0 ANSI: 5
> > >
> > > OK, so INQUIRY response payload is looking as expected here.
> >
> > Yep. It is not on the top of my head, but I remember something like INQUIRYs
> > are emulated and thus do not have payload.
> >
> > > [ 7.927960] sd 0:0:0:0: [sda] Sector size 0 reported, assuming 512.
> > > [ 7.927964] sd 0:0:0:0: [sda] 1 512-byte logical blocks: (512 B/512 B)
> > > [ 7.927965] sd 0:0:0:0: [sda] 0-byte physical blocks
> > >
> > > Strange.. READ_CAPACITY appears to be returning a payload as zeros..?
> >
> > Yep. Because blk_execute_rq() does not put the proper callback and data do
> > not get copied from sg's to bounce buffer. That is why I tried to use
> > blk_mq_execute_rq() instead. Once I do that, data start getting read and
> > booting stops elsewhere.
>
> Mmmmmm.
>
> The call to blk_queue_bounce() exists within blk_mq_make_request(), but
> AFAICT this should still be getting invoked regardless of if the struct
> request is dispatched into blk-mq via the modified blk_execute_rq() ->
> blk_execute_rq_nowait() -> blk_mq_insert_request() codepath, or directly
> via blk_mq_execute_rq()..
>
> Jens..?
>
Actually sorry, your right. A call to blk_mq_insert_request() for
REQ_TYPE_BLOCK_PC will not invoke blk_queue_bounce() located near the
top of blk_mq_execute_rq(), which means that only REQ_TYPE_FS is
currently using bounce buffers, if required.
Need to think a bit more about what to do here for REQ_TYPE_BLOCK_PC
bounce buffer special case with blk_execute_rq(), but I'm thinking that
blk_mq_execute_rq() should really not be used here..
Jens..?
--nab
On 07/18/2013 12:51 PM, Nicholas A. Bellinger wrote:
> On Wed, 2013-07-17 at 18:19 +0200, Alexander Gordeev wrote:
>> On Tue, Jul 16, 2013 at 02:38:03PM -0700, Nicholas A. Bellinger wrote:
>>> [ 7.927818] scsi_execute(): Calling blk_mq_free_request >>>
>>> [ 7.927826] scsi 0:0:0:0: Direct-Access ATA ST9500530NS CC03 PQ: 0 ANSI: 5
>>>
>>> OK, so INQUIRY response payload is looking as expected here.
>>
>> Yep. It is not on the top of my head, but I remember something like INQUIRYs
>> are emulated and thus do not have payload.
>>
>>> [ 7.927960] sd 0:0:0:0: [sda] Sector size 0 reported, assuming 512.
>>> [ 7.927964] sd 0:0:0:0: [sda] 1 512-byte logical blocks: (512 B/512 B)
>>> [ 7.927965] sd 0:0:0:0: [sda] 0-byte physical blocks
>>>
>>> Strange.. READ_CAPACITY appears to be returning a payload as zeros..?
>>
>> Yep. Because blk_execute_rq() does not put the proper callback and data do
>> not get copied from sg's to bounce buffer. That is why I tried to use
>> blk_mq_execute_rq() instead. Once I do that, data start getting read and
>> booting stops elsewhere.
>
> Mmmmmm.
>
> The call to blk_queue_bounce() exists within blk_mq_make_request(), but
> AFAICT this should still be getting invoked regardless of if the struct
> request is dispatched into blk-mq via the modified blk_execute_rq() ->
> blk_execute_rq_nowait() -> blk_mq_insert_request() codepath, or directly
> via blk_mq_execute_rq()..
>
blk_mq_make_request is not called from the blk insert/execute paths.
blk_mq_make_request takes a bio and tries to merge it with a request and
adds it to the queue. It is only called when the make_request_fn is
called like when generic_make_request is called.
blk_mq_insert_request adds a already formed request to the queue. It is
already formed so that is why that path does not bounce bios. The
bios/pages should already be added within the drivers restrictions. So
for the read_cap path, the call to blk_rq_map_kern in scsi_execute does
the blk_queue_bounce call.
Just saw this while trying out iscsi with the scsi-mq stuff :)
On 07/18/2013 01:14 PM, Nicholas A. Bellinger wrote:
> On Thu, 2013-07-18 at 11:51 -0700, Nicholas A. Bellinger wrote:
>> On Wed, 2013-07-17 at 18:19 +0200, Alexander Gordeev wrote:
>>> On Tue, Jul 16, 2013 at 02:38:03PM -0700, Nicholas A. Bellinger wrote:
>>>> [ 7.927818] scsi_execute(): Calling blk_mq_free_request >>>
>>>> [ 7.927826] scsi 0:0:0:0: Direct-Access ATA ST9500530NS CC03 PQ: 0 ANSI: 5
>>>>
>>>> OK, so INQUIRY response payload is looking as expected here.
>>>
>>> Yep. It is not on the top of my head, but I remember something like INQUIRYs
>>> are emulated and thus do not have payload.
>>>
>>>> [ 7.927960] sd 0:0:0:0: [sda] Sector size 0 reported, assuming 512.
>>>> [ 7.927964] sd 0:0:0:0: [sda] 1 512-byte logical blocks: (512 B/512 B)
>>>> [ 7.927965] sd 0:0:0:0: [sda] 0-byte physical blocks
>>>>
>>>> Strange.. READ_CAPACITY appears to be returning a payload as zeros..?
>>>
>>> Yep. Because blk_execute_rq() does not put the proper callback and data do
>>> not get copied from sg's to bounce buffer. That is why I tried to use
>>> blk_mq_execute_rq() instead. Once I do that, data start getting read and
>>> booting stops elsewhere.
>>
>> Mmmmmm.
>>
>> The call to blk_queue_bounce() exists within blk_mq_make_request(), but
>> AFAICT this should still be getting invoked regardless of if the struct
>> request is dispatched into blk-mq via the modified blk_execute_rq() ->
>> blk_execute_rq_nowait() -> blk_mq_insert_request() codepath, or directly
>> via blk_mq_execute_rq()..
>>
>> Jens..?
>>
>
> Actually sorry, your right. A call to blk_mq_insert_request() for
> REQ_TYPE_BLOCK_PC will not invoke blk_queue_bounce() located near the
> top of blk_mq_execute_rq(), which means that only REQ_TYPE_FS is
> currently using bounce buffers, if required.
>
> Need to think a bit more about what to do here for REQ_TYPE_BLOCK_PC
> bounce buffer special case with blk_execute_rq(), but I'm thinking that
> blk_mq_execute_rq() should really not be used here..
>
> Jens..?
It needs to be pre-bounced, blk-mq will only bounce incoming bios and
not requests merely added to the queue(s). Might be useful to add an
equiv blk_mq_make_request() for this.
--
Jens Axboe
On Thu, 2013-07-18 at 13:12 -0600, Mike Christie wrote:
> On 07/18/2013 12:51 PM, Nicholas A. Bellinger wrote:
> > On Wed, 2013-07-17 at 18:19 +0200, Alexander Gordeev wrote:
> >> On Tue, Jul 16, 2013 at 02:38:03PM -0700, Nicholas A. Bellinger wrote:
> >>> [ 7.927818] scsi_execute(): Calling blk_mq_free_request >>>
> >>> [ 7.927826] scsi 0:0:0:0: Direct-Access ATA ST9500530NS CC03 PQ: 0 ANSI: 5
> >>>
> >>> OK, so INQUIRY response payload is looking as expected here.
> >>
> >> Yep. It is not on the top of my head, but I remember something like INQUIRYs
> >> are emulated and thus do not have payload.
> >>
> >>> [ 7.927960] sd 0:0:0:0: [sda] Sector size 0 reported, assuming 512.
> >>> [ 7.927964] sd 0:0:0:0: [sda] 1 512-byte logical blocks: (512 B/512 B)
> >>> [ 7.927965] sd 0:0:0:0: [sda] 0-byte physical blocks
> >>>
> >>> Strange.. READ_CAPACITY appears to be returning a payload as zeros..?
> >>
> >> Yep. Because blk_execute_rq() does not put the proper callback and data do
> >> not get copied from sg's to bounce buffer. That is why I tried to use
> >> blk_mq_execute_rq() instead. Once I do that, data start getting read and
> >> booting stops elsewhere.
> >
> > Mmmmmm.
> >
> > The call to blk_queue_bounce() exists within blk_mq_make_request(), but
> > AFAICT this should still be getting invoked regardless of if the struct
> > request is dispatched into blk-mq via the modified blk_execute_rq() ->
> > blk_execute_rq_nowait() -> blk_mq_insert_request() codepath, or directly
> > via blk_mq_execute_rq()..
> >
>
> blk_mq_make_request is not called from the blk insert/execute paths.
> blk_mq_make_request takes a bio and tries to merge it with a request and
> adds it to the queue. It is only called when the make_request_fn is
> called like when generic_make_request is called.
>
> blk_mq_insert_request adds a already formed request to the queue. It is
> already formed so that is why that path does not bounce bios. The
> bios/pages should already be added within the drivers restrictions. So
> for the read_cap path, the call to blk_rq_map_kern in scsi_execute does
> the blk_queue_bounce call.
>
<nod>, just noticed the blk_queue_bounce() in blk_rq_map_kern().
Not sure why this doesn't seem to be doing what it's supposed to for
libata just yet..
> Just saw this while trying out iscsi with the scsi-mq stuff :)
>
Took at stab at this a while back, but ended getting distracted on other
items. Do you have an initial conversion running yet..?
--nab
On Thu, Jul 18 2013, Nicholas A. Bellinger wrote:
> On Thu, 2013-07-18 at 13:12 -0600, Mike Christie wrote:
> > On 07/18/2013 12:51 PM, Nicholas A. Bellinger wrote:
> > > On Wed, 2013-07-17 at 18:19 +0200, Alexander Gordeev wrote:
> > >> On Tue, Jul 16, 2013 at 02:38:03PM -0700, Nicholas A. Bellinger wrote:
> > >>> [ 7.927818] scsi_execute(): Calling blk_mq_free_request >>>
> > >>> [ 7.927826] scsi 0:0:0:0: Direct-Access ATA ST9500530NS CC03 PQ: 0 ANSI: 5
> > >>>
> > >>> OK, so INQUIRY response payload is looking as expected here.
> > >>
> > >> Yep. It is not on the top of my head, but I remember something like INQUIRYs
> > >> are emulated and thus do not have payload.
> > >>
> > >>> [ 7.927960] sd 0:0:0:0: [sda] Sector size 0 reported, assuming 512.
> > >>> [ 7.927964] sd 0:0:0:0: [sda] 1 512-byte logical blocks: (512 B/512 B)
> > >>> [ 7.927965] sd 0:0:0:0: [sda] 0-byte physical blocks
> > >>>
> > >>> Strange.. READ_CAPACITY appears to be returning a payload as zeros..?
> > >>
> > >> Yep. Because blk_execute_rq() does not put the proper callback and data do
> > >> not get copied from sg's to bounce buffer. That is why I tried to use
> > >> blk_mq_execute_rq() instead. Once I do that, data start getting read and
> > >> booting stops elsewhere.
> > >
> > > Mmmmmm.
> > >
> > > The call to blk_queue_bounce() exists within blk_mq_make_request(), but
> > > AFAICT this should still be getting invoked regardless of if the struct
> > > request is dispatched into blk-mq via the modified blk_execute_rq() ->
> > > blk_execute_rq_nowait() -> blk_mq_insert_request() codepath, or directly
> > > via blk_mq_execute_rq()..
> > >
> >
> > blk_mq_make_request is not called from the blk insert/execute paths.
> > blk_mq_make_request takes a bio and tries to merge it with a request and
> > adds it to the queue. It is only called when the make_request_fn is
> > called like when generic_make_request is called.
> >
> > blk_mq_insert_request adds a already formed request to the queue. It is
> > already formed so that is why that path does not bounce bios. The
> > bios/pages should already be added within the drivers restrictions. So
> > for the read_cap path, the call to blk_rq_map_kern in scsi_execute does
> > the blk_queue_bounce call.
> >
>
> <nod>, just noticed the blk_queue_bounce() in blk_rq_map_kern().
>
> Not sure why this doesn't seem to be doing what it's supposed to for
> libata just yet..
How are you make the request from the bio? It'd be pretty trivial to
ensure that it gets bounced properly... blk_mq_execute_rq() assumes a
fully complete request, so it wont bounce anything.
--
Jens Axboe
On Thu, 2013-07-18 at 18:30 -0600, Jens Axboe wrote:
> On Thu, Jul 18 2013, Nicholas A. Bellinger wrote:
> > On Thu, 2013-07-18 at 13:12 -0600, Mike Christie wrote:
> > > On 07/18/2013 12:51 PM, Nicholas A. Bellinger wrote:
> > > > On Wed, 2013-07-17 at 18:19 +0200, Alexander Gordeev wrote:
> > > >> On Tue, Jul 16, 2013 at 02:38:03PM -0700, Nicholas A. Bellinger wrote:
> > > >>> [ 7.927818] scsi_execute(): Calling blk_mq_free_request >>>
> > > >>> [ 7.927826] scsi 0:0:0:0: Direct-Access ATA ST9500530NS CC03 PQ: 0 ANSI: 5
> > > >>>
> > > >>> OK, so INQUIRY response payload is looking as expected here.
> > > >>
> > > >> Yep. It is not on the top of my head, but I remember something like INQUIRYs
> > > >> are emulated and thus do not have payload.
> > > >>
> > > >>> [ 7.927960] sd 0:0:0:0: [sda] Sector size 0 reported, assuming 512.
> > > >>> [ 7.927964] sd 0:0:0:0: [sda] 1 512-byte logical blocks: (512 B/512 B)
> > > >>> [ 7.927965] sd 0:0:0:0: [sda] 0-byte physical blocks
> > > >>>
> > > >>> Strange.. READ_CAPACITY appears to be returning a payload as zeros..?
> > > >>
> > > >> Yep. Because blk_execute_rq() does not put the proper callback and data do
> > > >> not get copied from sg's to bounce buffer. That is why I tried to use
> > > >> blk_mq_execute_rq() instead. Once I do that, data start getting read and
> > > >> booting stops elsewhere.
> > > >
> > > > Mmmmmm.
> > > >
> > > > The call to blk_queue_bounce() exists within blk_mq_make_request(), but
> > > > AFAICT this should still be getting invoked regardless of if the struct
> > > > request is dispatched into blk-mq via the modified blk_execute_rq() ->
> > > > blk_execute_rq_nowait() -> blk_mq_insert_request() codepath, or directly
> > > > via blk_mq_execute_rq()..
> > > >
> > >
> > > blk_mq_make_request is not called from the blk insert/execute paths.
> > > blk_mq_make_request takes a bio and tries to merge it with a request and
> > > adds it to the queue. It is only called when the make_request_fn is
> > > called like when generic_make_request is called.
> > >
> > > blk_mq_insert_request adds a already formed request to the queue. It is
> > > already formed so that is why that path does not bounce bios. The
> > > bios/pages should already be added within the drivers restrictions. So
> > > for the read_cap path, the call to blk_rq_map_kern in scsi_execute does
> > > the blk_queue_bounce call.
> > >
> >
> > <nod>, just noticed the blk_queue_bounce() in blk_rq_map_kern().
> >
> > Not sure why this doesn't seem to be doing what it's supposed to for
> > libata just yet..
>
> How are you make the request from the bio? It'd be pretty trivial to
> ensure that it gets bounced properly... blk_mq_execute_rq() assumes a
> fully complete request, so it wont bounce anything.
>
>From what I gather for REQ_TYPE_BLOCK_PC, scsi_execute() ->
blk_rq_map_kern() -> blk_rq_append_bio() -> blk_rq_bio_prep() is what
does the request setup from the bios returned by bio_[copy,map]_kern()
in blk_rq_map_kern() code.
blk_queue_bounce() is called immediately after blk_rq_append_bio() here,
which AFAICT looks like it's doing the correct thing for scsi-mq..
What is strange here is that libata-scsi.c CDB emulation code is doing
the same stuff for both INQUIRY (that seems to be OK) and READ_CAPACITY
(that is returning zeros), which makes me think that something else is
going on..
Alexander, where you able to re-test using sdev->sdev_mq_reg.queue_depth
= 1 in scsi_mq_alloc_queue()..?
--nab
On Thu, 2013-07-18 at 18:03 -0700, Nicholas A. Bellinger wrote:
> On Thu, 2013-07-18 at 18:30 -0600, Jens Axboe wrote:
> > On Thu, Jul 18 2013, Nicholas A. Bellinger wrote:
> > > On Thu, 2013-07-18 at 13:12 -0600, Mike Christie wrote:
> > > > On 07/18/2013 12:51 PM, Nicholas A. Bellinger wrote:
<SNIP>
> > > <nod>, just noticed the blk_queue_bounce() in blk_rq_map_kern().
> > >
> > > Not sure why this doesn't seem to be doing what it's supposed to for
> > > libata just yet..
> >
> > How are you make the request from the bio? It'd be pretty trivial to
> > ensure that it gets bounced properly... blk_mq_execute_rq() assumes a
> > fully complete request, so it wont bounce anything.
> >
>
> From what I gather for REQ_TYPE_BLOCK_PC, scsi_execute() ->
> blk_rq_map_kern() -> blk_rq_append_bio() -> blk_rq_bio_prep() is what
> does the request setup from the bios returned by bio_[copy,map]_kern()
> in blk_rq_map_kern() code.
>
> blk_queue_bounce() is called immediately after blk_rq_append_bio() here,
> which AFAICT looks like it's doing the correct thing for scsi-mq..
>
> What is strange here is that libata-scsi.c CDB emulation code is doing
> the same stuff for both INQUIRY (that seems to be OK) and READ_CAPACITY
> (that is returning zeros), which makes me think that something else is
> going on..
>
> Alexander, where you able to re-test using sdev->sdev_mq_reg.queue_depth
> = 1 in scsi_mq_alloc_queue()..?
>
So after a bit more hacking tonight it appears the explicit setting of
dma_alignment by libata (sdev->sector_size - 1) is what is making
blk_rq_map_kern() invoke blk_copy_kern() instead of blk_map_kern(), and
triggering this scsi-mq specific bug with libata. I'm able to confirm
with QEMU IDE emulation the bug is occurring only after INQUIRY and
before READ_CAPACITY, as reported by Alexander.
Below is a quick hack to skip this setting in ata_scsi_dev_config() for
blk-mq, and leaves the default dma_alignment=0x03 for REQ_TYPE_BLOCK_PC
requests as initially set by scsi-core in scsi_init_request_queue().
Also included is the change for using queue_depth = min(SHT->can_queue,
SHT->cmd_per_lun) during scsi-mq request_queue initialization, along
with a very basic ata_piix conversion for testing purposes.
With these three changes in place, I'm able to register a single 1GB
ata_piix LUN using QEMU IDE emulation, and successfully run simple fio
writeverify tests with blocksize=4k @ queue_depth=1.
Obviously this is not a proper bugfix for unaligned blk_copy_kern() with
scsi-mq + REQ_TYPE_BLOCK_PC case, but should be enough to at least get
libata LUN scanning to work. Need to take a deeper look at the problem
here..
Alexander, care to give this work-around a shot on your bare-metal setup
using ata_piix..?
--nab
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 9a8a674..ac05cd6 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1066,6 +1066,8 @@ static u8 piix_vmw_bmdma_status(struct ata_port *ap)
static struct scsi_host_template piix_sht = {
ATA_BMDMA_SHT(DRV_NAME),
+ .scsi_mq = true,
+ .queuecommand_mq = ata_scsi_queuecmd,
};
static struct ata_port_operations piix_sata_ops = {
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 0101af5..191bc15 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1144,7 +1144,11 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
"sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
sdev->sector_size);
- blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
+ if (!q->mq_ops) {
+ blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
+ } else {
+ printk("Skipping dma_alignment for libata w/ scsi-mq\n");
+ }
if (dev->flags & ATA_DFLAG_AN)
set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events);
diff --git a/drivers/scsi/scsi-mq.c b/drivers/scsi/scsi-mq.c
index ca6ff67..81b2633 100644
--- a/drivers/scsi/scsi-mq.c
+++ b/drivers/scsi/scsi-mq.c
@@ -199,11 +199,11 @@ int scsi_mq_alloc_queue(struct Scsi_Host *sh, struct scsi_device *sdev)
int i, j;
sdev->sdev_mq_reg.ops = &scsi_mq_ops;
- sdev->sdev_mq_reg.queue_depth = sdev->queue_depth;
+ sdev->sdev_mq_reg.queue_depth = min((short)sh->hostt->can_queue,
+ sh->hostt->cmd_per_lun);
sdev->sdev_mq_reg.cmd_size = sizeof(struct scsi_cmnd) + sh->hostt->cmd_size;
sdev->sdev_mq_reg.numa_node = NUMA_NO_NODE;
sdev->sdev_mq_reg.nr_hw_queues = 1;
- sdev->sdev_mq_reg.queue_depth = 64;
sdev->sdev_mq_reg.flags = BLK_MQ_F_SHOULD_MERGE;
printk("Calling blk_mq_init_queue: scsi_mq_ops: %p, queue_depth: %d,"
On Thu, 2013-07-18 at 23:34 -0700, Nicholas A. Bellinger wrote:
> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> index 0101af5..191bc15 100644
> --- a/drivers/ata/libata-scsi.c
> +++ b/drivers/ata/libata-scsi.c
> @@ -1144,7 +1144,11 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
> "sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
> sdev->sector_size);
>
> - blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
> + if (!q->mq_ops) {
> + blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
> + } else {
> + printk("Skipping dma_alignment for libata w/ scsi-mq\n");
> + }
Amazingly enough there is a reason for the dma alignment, and it wasn't
just to annoy you, so you can't blindly do this.
The email thread is probably lost in the mists of time, but if I
remember correctly the problem is that some ahci DMA controllers barf if
the sector they're doing DMA on crosses a page boundary. Some are
annoying enough to actually cause silent data corruption. You won't
find every ahci DMA controller doing this, so the change will work for
some, but it will be hard to identify those it won't work for until
people start losing data.
The correct fix, obviously, is to do the bio copy on the kernel path for
unaligned data. It is OK to assume that REQ_TYPE_FS data is correctly
aligned (because of the block to page alignment).
James
On 07/18/2013 06:23 PM, Nicholas A. Bellinger wrote:
>> Just saw this while trying out iscsi with the scsi-mq stuff :)
>> >
> Took at stab at this a while back, but ended getting distracted on other
> items. Do you have an initial conversion running yet..?
Not running well :) Have a patch but I am debugging it now. I messed up
something with the cmd_size stuff.
On Fri, 2013-07-19 at 08:33 -0700, James Bottomley wrote:
> On Thu, 2013-07-18 at 23:34 -0700, Nicholas A. Bellinger wrote:
> > diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> > index 0101af5..191bc15 100644
> > --- a/drivers/ata/libata-scsi.c
> > +++ b/drivers/ata/libata-scsi.c
> > @@ -1144,7 +1144,11 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
> > "sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
> > sdev->sector_size);
> >
> > - blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
> > + if (!q->mq_ops) {
> > + blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
> > + } else {
> > + printk("Skipping dma_alignment for libata w/ scsi-mq\n");
> > + }
>
> Amazingly enough there is a reason for the dma alignment, and it wasn't
> just to annoy you, so you can't blindly do this.
>
> The email thread is probably lost in the mists of time, but if I
> remember correctly the problem is that some ahci DMA controllers barf if
> the sector they're doing DMA on crosses a page boundary. Some are
> annoying enough to actually cause silent data corruption. You won't
> find every ahci DMA controller doing this, so the change will work for
> some, but it will be hard to identify those it won't work for until
> people start losing data.
Thanks for the extra background.
So at least from what I gather thus far this shouldn't be an issue for
initial testing with scsi-mq <-> libata w/ ata_piix.
>
> The correct fix, obviously, is to do the bio copy on the kernel path for
> unaligned data. It is OK to assume that REQ_TYPE_FS data is correctly
> aligned (because of the block to page alignment).
>
Indeed. Looking into the bio_copy_kern() breakage next..
--nab
On Fri, 2013-07-19 at 09:58 -0600, Mike Christie wrote:
> On 07/18/2013 06:23 PM, Nicholas A. Bellinger wrote:
> >> Just saw this while trying out iscsi with the scsi-mq stuff :)
> >> >
> > Took at stab at this a while back, but ended getting distracted on other
> > items. Do you have an initial conversion running yet..?
>
> Not running well :) Have a patch but I am debugging it now. I messed up
> something with the cmd_size stuff.
<nod>, looking forward to see ib_iser move beyond the existing
scsi_request_fn() bottleneck. ;)
Feel free to send me the WIP off-list if you'd like another pair of
eyes.
--nab
On Fri, 2013-07-19 at 14:01 -0700, Nicholas A. Bellinger wrote:
> On Fri, 2013-07-19 at 08:33 -0700, James Bottomley wrote:
> > On Thu, 2013-07-18 at 23:34 -0700, Nicholas A. Bellinger wrote:
> > > diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> > > index 0101af5..191bc15 100644
> > > --- a/drivers/ata/libata-scsi.c
> > > +++ b/drivers/ata/libata-scsi.c
> > > @@ -1144,7 +1144,11 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
> > > "sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
> > > sdev->sector_size);
> > >
> > > - blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
> > > + if (!q->mq_ops) {
> > > + blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
> > > + } else {
> > > + printk("Skipping dma_alignment for libata w/ scsi-mq\n");
> > > + }
> >
> > Amazingly enough there is a reason for the dma alignment, and it wasn't
> > just to annoy you, so you can't blindly do this.
> >
> > The email thread is probably lost in the mists of time, but if I
> > remember correctly the problem is that some ahci DMA controllers barf if
> > the sector they're doing DMA on crosses a page boundary. Some are
> > annoying enough to actually cause silent data corruption. You won't
> > find every ahci DMA controller doing this, so the change will work for
> > some, but it will be hard to identify those it won't work for until
> > people start losing data.
>
> Thanks for the extra background.
>
> So at least from what I gather thus far this shouldn't be an issue for
> initial testing with scsi-mq <-> libata w/ ata_piix.
>
> >
> > The correct fix, obviously, is to do the bio copy on the kernel path for
> > unaligned data. It is OK to assume that REQ_TYPE_FS data is correctly
> > aligned (because of the block to page alignment).
> >
>
> Indeed. Looking into the bio_copy_kern() breakage next..
>
OK, after further investigation the root cause is a actually a missing
bio->bio_end_io() -> bio_copy_kern_endio() -> bio_put() from the
blk_end_sync_rq() callback path that scsi-mq REQ_TYPE_BLOCK_PC is
currently using.
Including the following patch into the scsi-mq working branch now, and
reverting the libata dma_alignment=0x03 hack.
Alexander, care to give this a try..?
--nab
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 0761c89..70303d2 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -25,7 +25,10 @@ static void blk_end_sync_rq(struct request *rq, int error)
struct completion *waiting = rq->end_io_data;
rq->end_io_data = NULL;
- if (!rq->q->mq_ops) {
+ if (rq->q->mq_ops) {
+ if (rq->bio)
+ bio_endio(rq->bio, error);
+ } else {
__blk_put_request(rq->q, rq);
}
On 07/19/2013 11:56 PM, Nicholas A. Bellinger wrote:
> On Fri, 2013-07-19 at 14:01 -0700, Nicholas A. Bellinger wrote:
>> On Fri, 2013-07-19 at 08:33 -0700, James Bottomley wrote:
>>> On Thu, 2013-07-18 at 23:34 -0700, Nicholas A. Bellinger wrote:
>>>> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
>>>> index 0101af5..191bc15 100644
>>>> --- a/drivers/ata/libata-scsi.c
>>>> +++ b/drivers/ata/libata-scsi.c
>>>> @@ -1144,7 +1144,11 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
>>>> "sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
>>>> sdev->sector_size);
>>>>
>>>> - blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
>>>> + if (!q->mq_ops) {
>>>> + blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
>>>> + } else {
>>>> + printk("Skipping dma_alignment for libata w/ scsi-mq\n");
>>>> + }
>>>
>>> Amazingly enough there is a reason for the dma alignment, and it wasn't
>>> just to annoy you, so you can't blindly do this.
>>>
>>> The email thread is probably lost in the mists of time, but if I
>>> remember correctly the problem is that some ahci DMA controllers barf if
>>> the sector they're doing DMA on crosses a page boundary. Some are
>>> annoying enough to actually cause silent data corruption. You won't
>>> find every ahci DMA controller doing this, so the change will work for
>>> some, but it will be hard to identify those it won't work for until
>>> people start losing data.
>>
>> Thanks for the extra background.
>>
>> So at least from what I gather thus far this shouldn't be an issue for
>> initial testing with scsi-mq <-> libata w/ ata_piix.
>>
>>>
>>> The correct fix, obviously, is to do the bio copy on the kernel path for
>>> unaligned data. It is OK to assume that REQ_TYPE_FS data is correctly
>>> aligned (because of the block to page alignment).
>>>
>>
>> Indeed. Looking into the bio_copy_kern() breakage next..
>>
>
> OK, after further investigation the root cause is a actually a missing
> bio->bio_end_io() -> bio_copy_kern_endio() -> bio_put() from the
> blk_end_sync_rq() callback path that scsi-mq REQ_TYPE_BLOCK_PC is
> currently using.
>
> Including the following patch into the scsi-mq working branch now, and
> reverting the libata dma_alignment=0x03 hack.
>
> Alexander, care to give this a try..?
>
> --nab
>
> diff --git a/block/blk-exec.c b/block/blk-exec.c
> index 0761c89..70303d2 100644
> --- a/block/blk-exec.c
> +++ b/block/blk-exec.c
> @@ -25,7 +25,10 @@ static void blk_end_sync_rq(struct request *rq, int error)
> struct completion *waiting = rq->end_io_data;
>
> rq->end_io_data = NULL;
> - if (!rq->q->mq_ops) {
> + if (rq->q->mq_ops) {
> + if (rq->bio)
> + bio_endio(rq->bio, error);
> + } else {
> __blk_put_request(rq->q, rq);
> }
>
This does not handle requests with multiple bios, and for the mq stye
passthrough insertion completions you actually want to call
blk_mq_finish_request in scsi_execute. Same for all the other
passthrough code in your scsi mq tree. That is your root bug. Instead of
doing that though I think we want to have the block layer free the bios
like before.
For the non mq calls, blk_end_request type of calls will complete the
bios when blk_finish_request is called from that path. It will then call
the rq end_io callback.
I think the blk mq code assumes if the end_io callack is set that the
end_io function will do the bio cleanup. See __blk_mq_end_io. Also see
how blk_mq_execute_rq calls blk_mq_finish_request for an example of how
rq passthrough execution and cleanup is being done in the mq paths.
Now with the scsi mq changes, when blk_execute_rq_nowait calls
blk_mq_insert_request it calls it with a old non mq style of end io
function that does not complete the bios.
What about the attached only compile tested patch. The patch has the mq
block code work like the non mq code for bio cleanups.
On Sat, 2013-07-20 at 09:48 -0500, Mike Christie wrote:
> On 07/19/2013 11:56 PM, Nicholas A. Bellinger wrote:
> > On Fri, 2013-07-19 at 14:01 -0700, Nicholas A. Bellinger wrote:
> >> On Fri, 2013-07-19 at 08:33 -0700, James Bottomley wrote:
<SNIP>
> >>
> >> Indeed. Looking into the bio_copy_kern() breakage next..
> >>
> >
> > OK, after further investigation the root cause is a actually a missing
> > bio->bio_end_io() -> bio_copy_kern_endio() -> bio_put() from the
> > blk_end_sync_rq() callback path that scsi-mq REQ_TYPE_BLOCK_PC is
> > currently using.
> >
> > Including the following patch into the scsi-mq working branch now, and
> > reverting the libata dma_alignment=0x03 hack.
> >
> > Alexander, care to give this a try..?
> >
> > --nab
> >
> > diff --git a/block/blk-exec.c b/block/blk-exec.c
> > index 0761c89..70303d2 100644
> > --- a/block/blk-exec.c
> > +++ b/block/blk-exec.c
> > @@ -25,7 +25,10 @@ static void blk_end_sync_rq(struct request *rq, int error)
> > struct completion *waiting = rq->end_io_data;
> >
> > rq->end_io_data = NULL;
> > - if (!rq->q->mq_ops) {
> > + if (rq->q->mq_ops) {
> > + if (rq->bio)
> > + bio_endio(rq->bio, error);
> > + } else {
> > __blk_put_request(rq->q, rq);
> > }
> >
>
>
> This does not handle requests with multiple bios, and for the mq stye
> passthrough insertion completions you actually want to call
> blk_mq_finish_request in scsi_execute. Same for all the other
> passthrough code in your scsi mq tree. That is your root bug. Instead of
> doing that though I think we want to have the block layer free the bios
> like before.
>
> For the non mq calls, blk_end_request type of calls will complete the
> bios when blk_finish_request is called from that path. It will then call
> the rq end_io callback.
>
> I think the blk mq code assumes if the end_io callack is set that the
> end_io function will do the bio cleanup. See __blk_mq_end_io. Also see
> how blk_mq_execute_rq calls blk_mq_finish_request for an example of how
> rq passthrough execution and cleanup is being done in the mq paths.
>
> Now with the scsi mq changes, when blk_execute_rq_nowait calls
> blk_mq_insert_request it calls it with a old non mq style of end io
> function that does not complete the bios.
>
> What about the attached only compile tested patch. The patch has the mq
> block code work like the non mq code for bio cleanups.
>
>
OK, so with your blk-mq patch in place to always call
blk_mq_finish_request() in blk_mq_complete_request() regardless of
rq->end_io, the preceding scsi_mq_end_request() can now simply call
blk_mq_end_io() for both BLOCK_RQ and FS request types.
diff --git a/drivers/scsi/scsi-mq.c b/drivers/scsi/scsi-mq.c
index 81b2633..f1d4789 100644
--- a/drivers/scsi/scsi-mq.c
+++ b/drivers/scsi/scsi-mq.c
@@ -93,19 +93,7 @@ struct scsi_cmnd *scsi_mq_end_request(struct scsi_cmnd *sc, int error,
#endif
//FIXME: Add proper blk_mq_end_io residual bytes + requeue
- if (rq->end_io) {
-#if 0
- printk("scsi_mq_end_request: Calling rq->end_io BLOCK_PC for"
- " CDB: 0x%02x\n", sc->cmnd[0]);
-#endif
- rq->end_io(rq, error);
- } else {
-#if 0
- printk("scsi_mq_end_request: Calling blk_mq_end_io for CDB: 0x%02x\n",
- sc->cmnd[0]);
-#endif
- blk_mq_end_io(rq, error);
- }
+ blk_mq_end_io(rq, error);
//FIXME: Need to do equiv of scsi_next_command to kick hctx..?
return NULL;
Thanks for fixing up that bit of ugliness. ;)
Jens, care to review+include Mike's change into your working branch..?
--nab
On Sat, Jul 20 2013, Mike Christie wrote:
> blk-mq: blk-mq should free bios in pass through case
>
> For non mq calls, the block layer will free the bios when
> blk_finish_request is called.
>
> For mq calls, the blk mq code wants the caller to do this.
>
> This patch has the blk mq code work like the non mq code
> and has the block layer free the bios.
Thanks Mike, looks good, applied.
--
Jens Axboe
On Fri, Jul 19, 2013 at 09:56:02PM -0700, Nicholas A. Bellinger wrote:
> On Fri, 2013-07-19 at 14:01 -0700, Nicholas A. Bellinger wrote:
> OK, after further investigation the root cause is a actually a missing
> bio->bio_end_io() -> bio_copy_kern_endio() -> bio_put() from the
> blk_end_sync_rq() callback path that scsi-mq REQ_TYPE_BLOCK_PC is
> currently using.
Yes, missing bio_copy_kern_endio() callback is exactly the reason I
turned to blk_mq_execute_rq() initially. I should have been more
specific on this :|
I will try Mike's and your other change, hopefully soon (sorry,
constantly getting distracted).
> Including the following patch into the scsi-mq working branch now, and
> reverting the libata dma_alignment=0x03 hack.
--
Regards,
Alexander Gordeev
[email protected]
On Mon, 2013-07-22 at 17:03 +0200, Alexander Gordeev wrote:
> On Fri, Jul 19, 2013 at 09:56:02PM -0700, Nicholas A. Bellinger wrote:
> > On Fri, 2013-07-19 at 14:01 -0700, Nicholas A. Bellinger wrote:
> > OK, after further investigation the root cause is a actually a missing
> > bio->bio_end_io() -> bio_copy_kern_endio() -> bio_put() from the
> > blk_end_sync_rq() callback path that scsi-mq REQ_TYPE_BLOCK_PC is
> > currently using.
>
> Yes, missing bio_copy_kern_endio() callback is exactly the reason I
> turned to blk_mq_execute_rq() initially. I should have been more
> specific on this :|
>
> I will try Mike's and your other change, hopefully soon (sorry,
> constantly getting distracted).
>
Np. FYI, you'll want to use the latest commit e7827b351 HEAD from
target-pending/scsi-mq, which now has functioning scsi-generic support.
Also, your scsi_times_out patch from earlier has not been included just
yet, but that should be the only extra patch you need to apply in order
to get scsi-mq enabled libata/ata_piix running.
--nab
On Mon, Jul 22, 2013 at 02:10:36PM -0700, Nicholas A. Bellinger wrote:
> Np. FYI, you'll want to use the latest commit e7827b351 HEAD from
> target-pending/scsi-mq, which now has functioning scsi-generic support.
Survives a boot, a kernel build and the build's result :)
--
Regards,
Alexander Gordeev
[email protected]
On Thu, 2013-07-25 at 12:16 +0200, Alexander Gordeev wrote:
> On Mon, Jul 22, 2013 at 02:10:36PM -0700, Nicholas A. Bellinger wrote:
> > Np. FYI, you'll want to use the latest commit e7827b351 HEAD from
> > target-pending/scsi-mq, which now has functioning scsi-generic support.
>
> Survives a boot, a kernel build and the build's result :)
Great. Thanks for the feedback Alexander!
So the next step on my end is to enable -mq for ahci, and verify initial
correctness using QEMU/KVM hardware emulation.
Btw, I've been looking at enabling the SHT->cmd_size for struct
ata_queued_cmd descriptor pre-allocation, but AFAICT these descriptors
are already all pre-allocated by libata and obtained via ata_qc_new() ->
__ata_qc_from_tag() during ata_scsi_queuecmd().
So that said, with the struct request + struct scsi_cmnd pre-allocations
already provided by blk-mq -> scsi-mq code, all memory allocations
should have already been eliminated from I/O fast path.
--nab
On Thu, Jul 25 2013, Nicholas A. Bellinger wrote:
> On Thu, 2013-07-25 at 12:16 +0200, Alexander Gordeev wrote:
> > On Mon, Jul 22, 2013 at 02:10:36PM -0700, Nicholas A. Bellinger wrote:
> > > Np. FYI, you'll want to use the latest commit e7827b351 HEAD from
> > > target-pending/scsi-mq, which now has functioning scsi-generic support.
> >
> > Survives a boot, a kernel build and the build's result :)
>
> Great. Thanks for the feedback Alexander!
>
> So the next step on my end is to enable -mq for ahci, and verify initial
> correctness using QEMU/KVM hardware emulation.
>
> Btw, I've been looking at enabling the SHT->cmd_size for struct
> ata_queued_cmd descriptor pre-allocation, but AFAICT these descriptors
> are already all pre-allocated by libata and obtained via ata_qc_new() ->
> __ata_qc_from_tag() during ata_scsi_queuecmd().
Might still not be a bad idea to do it:
- Cleans up a driver, getting rid of the need to alloc, maintain, and
free those structures.
- Should be some cache locality benefits to having it all sequential.
> So that said, with the struct request + struct scsi_cmnd pre-allocations
> already provided by blk-mq -> scsi-mq code, all memory allocations
> should have already been eliminated from I/O fast path.
Nice!
--
Jens Axboe
On Thu, 2013-07-25 at 20:09 -0600, Jens Axboe wrote:
> On Thu, Jul 25 2013, Nicholas A. Bellinger wrote:
> > On Thu, 2013-07-25 at 12:16 +0200, Alexander Gordeev wrote:
> > > On Mon, Jul 22, 2013 at 02:10:36PM -0700, Nicholas A. Bellinger wrote:
> > > > Np. FYI, you'll want to use the latest commit e7827b351 HEAD from
> > > > target-pending/scsi-mq, which now has functioning scsi-generic support.
> > >
> > > Survives a boot, a kernel build and the build's result :)
> >
> > Great. Thanks for the feedback Alexander!
> >
> > So the next step on my end is to enable -mq for ahci, and verify initial
> > correctness using QEMU/KVM hardware emulation.
> >
> > Btw, I've been looking at enabling the SHT->cmd_size for struct
> > ata_queued_cmd descriptor pre-allocation, but AFAICT these descriptors
> > are already all pre-allocated by libata and obtained via ata_qc_new() ->
> > __ata_qc_from_tag() during ata_scsi_queuecmd().
>
> Might still not be a bad idea to do it:
>
> - Cleans up a driver, getting rid of the need to alloc, maintain, and
> free those structures.
>
> - Should be some cache locality benefits to having it all sequential.
>
Looking at this some more, there are a number of locations outside of
the main blk_mq_ops->queue_rq() -> SHT->queuecommand_mq() dispatch that
use *ata_qc_from_tag() to obtain *ata_queued_cmd, and a few without a
associated struct scsi_cmnd like libata-core.c:ata_exec_internal_sg()
for example..
So I don't think (completely) getting rid of ata_port->qcmds[] will be
possible, and just converting the ata_scsi_queuecmd() path to use the
extra SHT->cmd_size pre-allocation for *ata_queued_cmd might end up
being more trouble that it's worth. Still undecided on that part..
Tejun, do you have any thoughts + input here..?
--nab
On Fri, 2013-07-26 at 14:14 -0700, Nicholas A. Bellinger wrote:
> On Thu, 2013-07-25 at 20:09 -0600, Jens Axboe wrote:
> > On Thu, Jul 25 2013, Nicholas A. Bellinger wrote:
> > > On Thu, 2013-07-25 at 12:16 +0200, Alexander Gordeev wrote:
> > > > On Mon, Jul 22, 2013 at 02:10:36PM -0700, Nicholas A. Bellinger wrote:
> > > > > Np. FYI, you'll want to use the latest commit e7827b351 HEAD from
> > > > > target-pending/scsi-mq, which now has functioning scsi-generic support.
> > > >
> > > > Survives a boot, a kernel build and the build's result :)
> > >
> > > Great. Thanks for the feedback Alexander!
> > >
> > > So the next step on my end is to enable -mq for ahci, and verify initial
> > > correctness using QEMU/KVM hardware emulation.
> > >
> > > Btw, I've been looking at enabling the SHT->cmd_size for struct
> > > ata_queued_cmd descriptor pre-allocation, but AFAICT these descriptors
> > > are already all pre-allocated by libata and obtained via ata_qc_new() ->
> > > __ata_qc_from_tag() during ata_scsi_queuecmd().
> >
> > Might still not be a bad idea to do it:
> >
> > - Cleans up a driver, getting rid of the need to alloc, maintain, and
> > free those structures.
> >
> > - Should be some cache locality benefits to having it all sequential.
> >
>
> Looking at this some more, there are a number of locations outside of
> the main blk_mq_ops->queue_rq() -> SHT->queuecommand_mq() dispatch that
> use *ata_qc_from_tag() to obtain *ata_queued_cmd, and a few without a
> associated struct scsi_cmnd like libata-core.c:ata_exec_internal_sg()
> for example..
>
> So I don't think (completely) getting rid of ata_port->qcmds[] will be
> possible, and just converting the ata_scsi_queuecmd() path to use the
> extra SHT->cmd_size pre-allocation for *ata_queued_cmd might end up
> being more trouble that it's worth. Still undecided on that part..
>
> Tejun, do you have any thoughts + input here..?
>
OK, so I decided to give this a shot anyways.. Here is a quick
conversion for libata + AHCI to use blk-mq -> scsi-mq pre-allocation for
ata_queued_cmd descriptors:
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 2b50dfd..61b3db8 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -92,6 +92,9 @@ static int ahci_pci_device_resume(struct pci_dev *pdev);
static struct scsi_host_template ahci_sht = {
AHCI_SHT("ahci"),
+ .scsi_mq = true,
+ .cmd_size = sizeof(struct ata_queued_cmd),
+ .queuecommand_mq = ata_scsi_queuecmd,
};
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index f218427..e21814d 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4725,29 +4725,25 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
/**
* ata_qc_new - Request an available ATA command, for queueing
* @ap: target port
+ * @sc: incoming scsi_cmnd descriptor
*
* LOCKING:
* None.
*/
-static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
+static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap,
+ struct scsi_cmnd *sc)
{
struct ata_queued_cmd *qc = NULL;
- unsigned int i;
+ struct request *rq = sc->request;
/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
return NULL;
- /* the last tag is reserved for internal command. */
- for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
- if (!test_and_set_bit(i, &ap->qc_allocated)) {
- qc = __ata_qc_from_tag(ap, i);
- break;
- }
-
- if (qc)
- qc->tag = i;
+ qc = (struct ata_queued_cmd *)sc->SCp.ptr;
+ qc->scsicmd = sc;
+ qc->tag = rq->tag;
return qc;
}
@@ -4755,19 +4751,20 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
/**
* ata_qc_new_init - Request an available ATA command, and initialize it
* @dev: Device from whom we request an available command structure
+ * @sc: incoming scsi_cmnd descriptor
*
* LOCKING:
* None.
*/
-struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
+struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev,
+ struct scsi_cmnd *sc)
{
struct ata_port *ap = dev->link->ap;
struct ata_queued_cmd *qc;
- qc = ata_qc_new(ap);
+ qc = ata_qc_new(ap, sc);
if (qc) {
- qc->scsicmd = NULL;
qc->ap = ap;
qc->dev = dev;
@@ -4797,10 +4794,9 @@ void ata_qc_free(struct ata_queued_cmd *qc)
qc->flags = 0;
tag = qc->tag;
- if (likely(ata_tag_valid(tag))) {
+
+ if (likely(ata_tag_valid(tag)))
qc->tag = ATA_TAG_POISON;
- clear_bit(tag, &ap->qc_allocated);
- }
}
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 0101af5..e5ab880 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -742,9 +742,8 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
{
struct ata_queued_cmd *qc;
- qc = ata_qc_new_init(dev);
+ qc = ata_qc_new_init(dev, cmd);
if (qc) {
- qc->scsicmd = cmd;
qc->scsidone = cmd->scsi_done;
qc->sg = scsi_sglist(cmd);
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index c949dd3..4cd88af 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -63,7 +63,8 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev);
extern void ata_force_cbl(struct ata_port *ap);
extern u64 ata_tf_to_lba(const struct ata_taskfile *tf);
extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf);
-extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev);
+extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev,
+ struct scsi_cmnd *sc);
extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
u64 block, u32 n_block, unsigned int tf_flags,
unsigned int tag);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index eae7a05..52e9e9e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -35,9 +35,13 @@
#include <linux/ata.h>
#include <linux/workqueue.h>
#include <scsi/scsi_host.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_cmnd.h>
#include <linux/acpi.h>
#include <linux/cdrom.h>
#include <linux/sched.h>
+#include <linux/blk-mq.h>
+#include <../../block/blk-mq.h>
/*
* Define if arch has non-standard setup. This is a _PCI_ standard
@@ -1500,9 +1504,26 @@ static inline void ata_qc_set_polling(struct ata_queued_cmd *qc)
static inline struct ata_queued_cmd *__ata_qc_from_tag(struct ata_port *ap,
unsigned int tag)
{
- if (likely(ata_tag_valid(tag)))
- return &ap->qcmd[tag];
- return NULL;
+ struct scsi_device *sdev = ap->link.device[0].sdev;
+ struct request_queue *q = sdev->request_queue;
+
+ if (unlikely(!ata_tag_valid(tag)))
+ return NULL;
+
+ if (likely(sdev->host->hostt->scsi_mq)) {
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+ struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
+ struct request *rq;
+ struct ata_queued_cmd *qc;
+
+ BUG_ON(tag > hctx->queue_depth);
+
+ rq = hctx->rqs[tag];
+ qc = blk_mq_rq_to_pdu(rq) + sizeof(struct scsi_cmnd);
+ blk_mq_put_ctx(ctx);
+ return qc;
+ }
+ return &ap->qcmd[tag];
}
The thing that I'm hung up on now for existing __ata_qc_from_tag() usage
outside of the main blk_mq_ops->queue_rq -> SHT->queuecommand_mq()
dispatch path, is how to actually locate the underlying scsi_device ->
request_queue -> blk_mq_ctx -> blk_mq_hw_hctx from the passed
ata_port..?
Considering there can be more than a single ata_device hanging off each
ata_port, the '*sdev = ap->link.device[0].sdev' in __ata_qc_from_tag()
is definitely bogus, but I'm not sure how else to correlate
blk-mq/scsi-mq per device descriptors to existing code expecting
ata_port->qcmd[] descriptors to be shared across multiple devices..
Tejun..?
--nab
On 07/26/2013 04:09 AM, Jens Axboe wrote:
> On Thu, Jul 25 2013, Nicholas A. Bellinger wrote:
>> On Thu, 2013-07-25 at 12:16 +0200, Alexander Gordeev wrote:
>>> On Mon, Jul 22, 2013 at 02:10:36PM -0700, Nicholas A. Bellinger wrote:
>>>> Np. FYI, you'll want to use the latest commit e7827b351 HEAD from
>>>> target-pending/scsi-mq, which now has functioning scsi-generic support.
>>>
>>> Survives a boot, a kernel build and the build's result :)
>>
>> Great. Thanks for the feedback Alexander!
>>
>> So the next step on my end is to enable -mq for ahci, and verify initial
>> correctness using QEMU/KVM hardware emulation.
>>
>> Btw, I've been looking at enabling the SHT->cmd_size for struct
>> ata_queued_cmd descriptor pre-allocation, but AFAICT these descriptors
>> are already all pre-allocated by libata and obtained via ata_qc_new() ->
>> __ata_qc_from_tag() during ata_scsi_queuecmd().
>
> Might still not be a bad idea to do it:
>
> - Cleans up a driver, getting rid of the need to alloc, maintain, and
> free those structures.
>
> - Should be some cache locality benefits to having it all sequential.
>
>> So that said, with the struct request + struct scsi_cmnd pre-allocations
>> already provided by blk-mq -> scsi-mq code, all memory allocations
>> should have already been eliminated from I/O fast path.
>
> Nice!
>
Hmm.
I'm trying to work out if it would be possible to move multipath
handling over to scsi-mq.
However, when doing so I would need to reconfigure 'nr_hw_queues' on
the fly. Now with all the static cmd preallocation going on this is
going to be tricky.
This leaves me with two choices:
- Tear down the command pool altogether whenever I need to
reconfigure the device (which is going to be painful)
- Allocate some max nr_hw_queues, and mark the superfluous
ones as 'unused' or something. Seeing the a sane max nr_hw_queues
will be possibly the number of cpus this might end up hogging
quite some memory.
Would you accept patches moving the static command allocation over
to pools or is this a desired feature?
Cheers,
Hannes
--
Dr. Hannes Reinecke zSeries & Storage
[email protected] +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 N?rnberg
GF: J. Hawn, J. Guild, F. Imend?rffer, HRB 16746 (AG N?rnberg)
On Fri, Jul 26, 2013 at 05:43:13PM -0700, Nicholas A. Bellinger wrote:
> On Fri, 2013-07-26 at 14:14 -0700, Nicholas A. Bellinger wrote:
> > On Thu, 2013-07-25 at 20:09 -0600, Jens Axboe wrote:
> > > On Thu, Jul 25 2013, Nicholas A. Bellinger wrote:
> > > > On Thu, 2013-07-25 at 12:16 +0200, Alexander Gordeev wrote:
> > > > > On Mon, Jul 22, 2013 at 02:10:36PM -0700, Nicholas A. Bellinger wrote:
> > > > > > Np. FYI, you'll want to use the latest commit e7827b351 HEAD from
> > > > > > target-pending/scsi-mq, which now has functioning scsi-generic support.
> > > > >
> > > > > Survives a boot, a kernel build and the build's result :)
> > > >
> > > > Great. Thanks for the feedback Alexander!
> > > >
> > > > So the next step on my end is to enable -mq for ahci, and verify initial
> > > > correctness using QEMU/KVM hardware emulation.
> > > >
> > > > Btw, I've been looking at enabling the SHT->cmd_size for struct
> > > > ata_queued_cmd descriptor pre-allocation, but AFAICT these descriptors
> > > > are already all pre-allocated by libata and obtained via ata_qc_new() ->
> > > > __ata_qc_from_tag() during ata_scsi_queuecmd().
> > >
> > > Might still not be a bad idea to do it:
> > >
> > > - Cleans up a driver, getting rid of the need to alloc, maintain, and
> > > free those structures.
> > >
> > > - Should be some cache locality benefits to having it all sequential.
> > >
> >
> > Looking at this some more, there are a number of locations outside of
> > the main blk_mq_ops->queue_rq() -> SHT->queuecommand_mq() dispatch that
> > use *ata_qc_from_tag() to obtain *ata_queued_cmd, and a few without a
> > associated struct scsi_cmnd like libata-core.c:ata_exec_internal_sg()
> > for example..
> >
> > So I don't think (completely) getting rid of ata_port->qcmds[] will be
> > possible, and just converting the ata_scsi_queuecmd() path to use the
> > extra SHT->cmd_size pre-allocation for *ata_queued_cmd might end up
> > being more trouble that it's worth. Still undecided on that part..
> >
> > Tejun, do you have any thoughts + input here..?
> >
>
> OK, so I decided to give this a shot anyways.. Here is a quick
> conversion for libata + AHCI to use blk-mq -> scsi-mq pre-allocation for
> ata_queued_cmd descriptors:
>
> diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
> index 2b50dfd..61b3db8 100644
> --- a/drivers/ata/ahci.c
> +++ b/drivers/ata/ahci.c
> @@ -92,6 +92,9 @@ static int ahci_pci_device_resume(struct pci_dev *pdev);
>
> static struct scsi_host_template ahci_sht = {
> AHCI_SHT("ahci"),
> + .scsi_mq = true,
> + .cmd_size = sizeof(struct ata_queued_cmd),
> + .queuecommand_mq = ata_scsi_queuecmd,
> };
> diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
> index f218427..e21814d 100644
> --- a/drivers/ata/libata-core.c
> +++ b/drivers/ata/libata-core.c
> @@ -4725,29 +4725,25 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
> /**
> * ata_qc_new - Request an available ATA command, for queueing
> * @ap: target port
> + * @sc: incoming scsi_cmnd descriptor
> *
> * LOCKING:
> * None.
> */
>
> -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
> +static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap,
> + struct scsi_cmnd *sc)
> {
> struct ata_queued_cmd *qc = NULL;
> - unsigned int i;
> + struct request *rq = sc->request;
>
> /* no command while frozen */
> if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
> return NULL;
>
> - /* the last tag is reserved for internal command. */
> - for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
blk-mq does not prevent tag ATA_TAG_INTERNAL from being using. Would it make
sense to promote queue depth of length (ATA_MAX_QUEUE - 1) while always
pointing ATA_TAG_INTERNAL to qcmd (see below)?
> - if (!test_and_set_bit(i, &ap->qc_allocated)) {
ata_port::qc_allocated becomes redundant.
> - qc = __ata_qc_from_tag(ap, i);
> - break;
> - }
> -
> - if (qc)
> - qc->tag = i;
> + qc = (struct ata_queued_cmd *)sc->SCp.ptr;
> + qc->scsicmd = sc;
> + qc->tag = rq->tag;
>
> return qc;
> }
> @@ -4755,19 +4751,20 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
> /**
> * ata_qc_new_init - Request an available ATA command, and initialize it
> * @dev: Device from whom we request an available command structure
> + * @sc: incoming scsi_cmnd descriptor
> *
> * LOCKING:
> * None.
> */
>
> -struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev)
> +struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev,
> + struct scsi_cmnd *sc)
> {
> struct ata_port *ap = dev->link->ap;
> struct ata_queued_cmd *qc;
>
> - qc = ata_qc_new(ap);
> + qc = ata_qc_new(ap, sc);
> if (qc) {
> - qc->scsicmd = NULL;
> qc->ap = ap;
> qc->dev = dev;
>
> @@ -4797,10 +4794,9 @@ void ata_qc_free(struct ata_queued_cmd *qc)
>
> qc->flags = 0;
> tag = qc->tag;
> - if (likely(ata_tag_valid(tag))) {
> +
> + if (likely(ata_tag_valid(tag)))
> qc->tag = ATA_TAG_POISON;
> - clear_bit(tag, &ap->qc_allocated);
> - }
> }
>
> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> index 0101af5..e5ab880 100644
> --- a/drivers/ata/libata-scsi.c
> +++ b/drivers/ata/libata-scsi.c
> @@ -742,9 +742,8 @@ static struct ata_queued_cmd *ata_scsi_qc_new(struct ata_device *dev,
> {
> struct ata_queued_cmd *qc;
>
> - qc = ata_qc_new_init(dev);
> + qc = ata_qc_new_init(dev, cmd);
> if (qc) {
> - qc->scsicmd = cmd;
> qc->scsidone = cmd->scsi_done;
>
> qc->sg = scsi_sglist(cmd);
> diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
> index c949dd3..4cd88af 100644
> --- a/drivers/ata/libata.h
> +++ b/drivers/ata/libata.h
> @@ -63,7 +63,8 @@ extern struct ata_link *ata_dev_phys_link(struct ata_device *dev);
> extern void ata_force_cbl(struct ata_port *ap);
> extern u64 ata_tf_to_lba(const struct ata_taskfile *tf);
> extern u64 ata_tf_to_lba48(const struct ata_taskfile *tf);
> -extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev);
> +extern struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev,
> + struct scsi_cmnd *sc);
> extern int ata_build_rw_tf(struct ata_taskfile *tf, struct ata_device *dev,
> u64 block, u32 n_block, unsigned int tf_flags,
> unsigned int tag);
> diff --git a/include/linux/libata.h b/include/linux/libata.h
> index eae7a05..52e9e9e 100644
> --- a/include/linux/libata.h
> +++ b/include/linux/libata.h
> @@ -35,9 +35,13 @@
> #include <linux/ata.h>
> #include <linux/workqueue.h>
> #include <scsi/scsi_host.h>
> +#include <scsi/scsi_device.h>
> +#include <scsi/scsi_cmnd.h>
> #include <linux/acpi.h>
> #include <linux/cdrom.h>
> #include <linux/sched.h>
> +#include <linux/blk-mq.h>
> +#include <../../block/blk-mq.h>
>
> /*
> * Define if arch has non-standard setup. This is a _PCI_ standard
> @@ -1500,9 +1504,26 @@ static inline void ata_qc_set_polling(struct ata_queued_cmd *qc)
> static inline struct ata_queued_cmd *__ata_qc_from_tag(struct ata_port *ap,
> unsigned int tag)
> {
> - if (likely(ata_tag_valid(tag)))
> - return &ap->qcmd[tag];
> - return NULL;
> + struct scsi_device *sdev = ap->link.device[0].sdev;
> + struct request_queue *q = sdev->request_queue;
> +
> + if (unlikely(!ata_tag_valid(tag)))
> + return NULL;
> +
> + if (likely(sdev->host->hostt->scsi_mq)) {
Together with the comment above:
if (likely(sdev->host->hostt->scsi_mq && (tag != ATA_TAG_INTERNAL))) {
...
> + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
> + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
> + struct request *rq;
> + struct ata_queued_cmd *qc;
> +
> + BUG_ON(tag > hctx->queue_depth);
> +
> + rq = hctx->rqs[tag];
> + qc = blk_mq_rq_to_pdu(rq) + sizeof(struct scsi_cmnd);
> + blk_mq_put_ctx(ctx);
> + return qc;
> + }
> + return &ap->qcmd[tag];
> }
I also tried to make a "quick" conversion and hit the same issue(s) as you.
Generally, I am concerned with these assumptions in such approach:
1. While libata concept of tags matches nicely with blk-mq (blk_mq_hw_ctx::
rqs[] vs ata_port::qcmd[]) right now, it is too exposed to changes in blk-mq
in the long run. I.e. ata_link::sactive limits tags to indices, while tags
might become hashes. Easily fixable, but still.
2. Unallocated requests in blk-mq are accessed/analized from libata-eh.c as
result of such iterations:
for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
qc = __ata_qc_from_tag(ap, tag);
if (!(qc->flags & ATA_QCFLAG_FAILED))
continue;
...
}
While it is probably okay right now, it is still based on a premise that
blk-mq will not change the contents/concept of "payload", i.e. from embedded
to (re-)allocated memory.
> The thing that I'm hung up on now for existing __ata_qc_from_tag() usage
> outside of the main blk_mq_ops->queue_rq -> SHT->queuecommand_mq()
> dispatch path, is how to actually locate the underlying scsi_device ->
> request_queue -> blk_mq_ctx -> blk_mq_hw_hctx from the passed
> ata_port..?
I am actually in favor of getting rid of ata_queued_cmd::tag. Converting
ata_link::sactive to a list, making ata_link::active_tag as struct
ata_queued_cmd *ata_link::active_qc and converting ata_port::qc_allocated to a
list seems solves it all, including [2]. Have not checked it though.
Anyway, if we need a blk-mq tag (why?), we have qc->scsicmd->request->tag.
> Considering there can be more than a single ata_device hanging off each
> ata_port, the '*sdev = ap->link.device[0].sdev' in __ata_qc_from_tag()
> is definitely bogus, but I'm not sure how else to correlate
> blk-mq/scsi-mq per device descriptors to existing code expecting
> ata_port->qcmd[] descriptors to be shared across multiple devices..
>
> Tejun..?
>
> --nab
>
--
Regards,
Alexander Gordeev
[email protected]
Hello,
On Fri, Jul 26, 2013 at 02:14:36PM -0700, Nicholas A. Bellinger wrote:
> So I don't think (completely) getting rid of ata_port->qcmds[] will be
> possible, and just converting the ata_scsi_queuecmd() path to use the
> extra SHT->cmd_size pre-allocation for *ata_queued_cmd might end up
> being more trouble that it's worth. Still undecided on that part..
>
> Tejun, do you have any thoughts + input here..?
libata exception handling which includes probing doesn't go through
SCSI at all. It all works inside libata proper using ata_queuecmds
and only the result is exposed to SCSI. Most of those SCSI semantics
need to be emulated anyway, so this makes things a lot easier than
going through SCSI for each command. As it currently stands, it'd be
a lot of effort to try to embed ata_qc's into higher layer construct.
Given how it's used, I don't think it's a high priority task.
One thing which would probably be worthwhile tho is getting rid of the
bitmap based qc tag allocator in libata. That one is just borderline
stupid to keep around on any setup which is supposed to be scalable.
Thanks.
--
tejun
Yo,
On Fri, Jul 26, 2013 at 05:43:13PM -0700, Nicholas A. Bellinger wrote:
> Considering there can be more than a single ata_device hanging off each
> ata_port, the '*sdev = ap->link.device[0].sdev' in __ata_qc_from_tag()
> is definitely bogus, but I'm not sure how else to correlate
> blk-mq/scsi-mq per device descriptors to existing code expecting
> ata_port->qcmd[] descriptors to be shared across multiple devices..
>
> Tejun..?
I have no idea. Let's please just do simpler conversion and worry
about embedding qc's into scsi_cmnds later. libata isn't a normal
SCSI driver and has a rather its own thick midlayer doing the
impedance matching inbetween && I really don't think there is too much
benefit to be reaped from embedding qc's into scsi_cmnds.
Thanks.
--
tejun
On 07/29/2013 05:46 AM, Tejun Heo wrote:
> Hello,
>
> On Fri, Jul 26, 2013 at 02:14:36PM -0700, Nicholas A. Bellinger wrote:
>> So I don't think (completely) getting rid of ata_port->qcmds[] will be
>> possible, and just converting the ata_scsi_queuecmd() path to use the
>> extra SHT->cmd_size pre-allocation for *ata_queued_cmd might end up
>> being more trouble that it's worth. Still undecided on that part..
>>
>> Tejun, do you have any thoughts + input here..?
>
> libata exception handling which includes probing doesn't go through
> SCSI at all. It all works inside libata proper using ata_queuecmds
> and only the result is exposed to SCSI. Most of those SCSI semantics
> need to be emulated anyway, so this makes things a lot easier than
> going through SCSI for each command. As it currently stands, it'd be
> a lot of effort to try to embed ata_qc's into higher layer construct.
> Given how it's used, I don't think it's a high priority task.
>
> One thing which would probably be worthwhile tho is getting rid of the
> bitmap based qc tag allocator in libata. That one is just borderline
> stupid to keep around on any setup which is supposed to be scalable.
Your border might be wider than mine :-). Yes, the bitmap should
definitely go.
--
Jens Axboe
On 07/29/2013 05:18 AM, Alexander Gordeev wrote:
>> -static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
>> +static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap,
>> + struct scsi_cmnd *sc)
>> {
>> struct ata_queued_cmd *qc = NULL;
>> - unsigned int i;
>> + struct request *rq = sc->request;
>>
>> /* no command while frozen */
>> if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
>> return NULL;
>>
>> - /* the last tag is reserved for internal command. */
>> - for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
>
> blk-mq does not prevent tag ATA_TAG_INTERNAL from being using. Would it make
> sense to promote queue depth of length (ATA_MAX_QUEUE - 1) while always
> pointing ATA_TAG_INTERNAL to qcmd (see below)?
blk-mq does support a number of reserved tags, information just needs to
be passed in appropriately. So there is support for reserving X number
of error handling / emergency tags.
--
Jens Axboe
On Mon, 2013-07-29 at 07:50 -0400, Tejun Heo wrote:
> Yo,
>
> On Fri, Jul 26, 2013 at 05:43:13PM -0700, Nicholas A. Bellinger wrote:
> > Considering there can be more than a single ata_device hanging off each
> > ata_port, the '*sdev = ap->link.device[0].sdev' in __ata_qc_from_tag()
> > is definitely bogus, but I'm not sure how else to correlate
> > blk-mq/scsi-mq per device descriptors to existing code expecting
> > ata_port->qcmd[] descriptors to be shared across multiple devices..
> >
> > Tejun..?
>
> I have no idea. Let's please just do simpler conversion and worry
> about embedding qc's into scsi_cmnds later. libata isn't a normal
> SCSI driver and has a rather its own thick midlayer doing the
> impedance matching inbetween && I really don't think there is too much
> benefit to be reaped from embedding qc's into scsi_cmnds.
>
That is essentially the same conclusion that I came to, but wanted to at
least give you a chance to comment here. ;)
So that said, I'll include a simple conversion for libata into the
scsi-mq WIP branch, and folks who are interested in more detailed
conversions can pursue them as separate items.
--nab
On Mon, 2013-07-29 at 13:18 +0200, Alexander Gordeev wrote:
> On Fri, Jul 26, 2013 at 05:43:13PM -0700, Nicholas A. Bellinger wrote:
> > On Fri, 2013-07-26 at 14:14 -0700, Nicholas A. Bellinger wrote:
<SNIP>
> I also tried to make a "quick" conversion and hit the same issue(s) as you.
> Generally, I am concerned with these assumptions in such approach:
>
> 1. While libata concept of tags matches nicely with blk-mq (blk_mq_hw_ctx::
> rqs[] vs ata_port::qcmd[]) right now, it is too exposed to changes in blk-mq
> in the long run. I.e. ata_link::sactive limits tags to indices, while tags
> might become hashes. Easily fixable, but still.
>
> 2. Unallocated requests in blk-mq are accessed/analized from libata-eh.c as
> result of such iterations:
>
> for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
> qc = __ata_qc_from_tag(ap, tag);
>
> if (!(qc->flags & ATA_QCFLAG_FAILED))
> continue;
>
> ...
> }
>
> While it is probably okay right now, it is still based on a premise that
> blk-mq will not change the contents/concept of "payload", i.e. from embedded
> to (re-)allocated memory.
>
> > The thing that I'm hung up on now for existing __ata_qc_from_tag() usage
> > outside of the main blk_mq_ops->queue_rq -> SHT->queuecommand_mq()
> > dispatch path, is how to actually locate the underlying scsi_device ->
> > request_queue -> blk_mq_ctx -> blk_mq_hw_hctx from the passed
> > ata_port..?
>
> I am actually in favor of getting rid of ata_queued_cmd::tag. Converting
> ata_link::sactive to a list, making ata_link::active_tag as struct
> ata_queued_cmd *ata_link::active_qc and converting ata_port::qc_allocated to a
> list seems solves it all, including [2]. Have not checked it though.
>
> Anyway, if we need a blk-mq tag (why?), we have qc->scsicmd->request->tag.
>
Hi Alexander,
So given the feedback from Tejun, I'm going to setup back for the moment
from a larger conversion, and keep the SHT->cmd_size=0 setting for
libata in the scsi-mq WIP branch.
I'm happy to accept patches to drop the bitmap piece that Tejun
mentioned if your interested, but at least on my end right now there are
bigger fish to fry for scsi-mq. ;)
Thanks,
--nab
>> One thing which would probably be worthwhile tho is getting rid of the
>> bitmap based qc tag allocator in libata. That one is just borderline
>> stupid to keep around on any setup which is supposed to be scalable.
> Your border might be wider than mine :-). Yes, the bitmap should
> definitely go.
A naive implementation is obviously less-than-efficient. However, what
other problems exist with the libata QC tag allocator? I highly doubt
SATA will change to beyond 32 queue tags, primarily because it would
be a pain to change SDB FIS (it's likely to break the dozens of AHCI
controller implementations out there). Further, it seems like the
industry stopped caring about SATA and is pushing NVMe for future
offerings.
In any event, most modern systems should have instructions to count
leading zeroes and modify bits atomically.
-MC
On Mon, Jul 29, 2013 at 12:19 PM, Nicholas A. Bellinger
<[email protected]> wrote:
> On Mon, 2013-07-29 at 13:18 +0200, Alexander Gordeev wrote:
>> On Fri, Jul 26, 2013 at 05:43:13PM -0700, Nicholas A. Bellinger wrote:
>> > On Fri, 2013-07-26 at 14:14 -0700, Nicholas A. Bellinger wrote:
>
> <SNIP>
>
>> I also tried to make a "quick" conversion and hit the same issue(s) as you.
>> Generally, I am concerned with these assumptions in such approach:
>>
>> 1. While libata concept of tags matches nicely with blk-mq (blk_mq_hw_ctx::
>> rqs[] vs ata_port::qcmd[]) right now, it is too exposed to changes in blk-mq
>> in the long run. I.e. ata_link::sactive limits tags to indices, while tags
>> might become hashes. Easily fixable, but still.
>>
>> 2. Unallocated requests in blk-mq are accessed/analized from libata-eh.c as
>> result of such iterations:
>>
>> for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
>> qc = __ata_qc_from_tag(ap, tag);
>>
>> if (!(qc->flags & ATA_QCFLAG_FAILED))
>> continue;
>>
>> ...
>> }
>>
>> While it is probably okay right now, it is still based on a premise that
>> blk-mq will not change the contents/concept of "payload", i.e. from embedded
>> to (re-)allocated memory.
>>
>> > The thing that I'm hung up on now for existing __ata_qc_from_tag() usage
>> > outside of the main blk_mq_ops->queue_rq -> SHT->queuecommand_mq()
>> > dispatch path, is how to actually locate the underlying scsi_device ->
>> > request_queue -> blk_mq_ctx -> blk_mq_hw_hctx from the passed
>> > ata_port..?
>>
>> I am actually in favor of getting rid of ata_queued_cmd::tag. Converting
>> ata_link::sactive to a list, making ata_link::active_tag as struct
>> ata_queued_cmd *ata_link::active_qc and converting ata_port::qc_allocated to a
>> list seems solves it all, including [2]. Have not checked it though.
>>
>> Anyway, if we need a blk-mq tag (why?), we have qc->scsicmd->request->tag.
>>
>
> Hi Alexander,
>
> So given the feedback from Tejun, I'm going to setup back for the moment
> from a larger conversion, and keep the SHT->cmd_size=0 setting for
> libata in the scsi-mq WIP branch.
>
> I'm happy to accept patches to drop the bitmap piece that Tejun
> mentioned if your interested, but at least on my end right now there are
> bigger fish to fry for scsi-mq. ;)
>
> Thanks,
>
> --nab
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ide" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
Hello,
On Tue, Jul 30, 2013 at 09:16:02PM -0700, Marc C wrote:
> >> One thing which would probably be worthwhile tho is getting rid of the
> >> bitmap based qc tag allocator in libata. That one is just borderline
> >> stupid to keep around on any setup which is supposed to be scalable.
> > Your border might be wider than mine :-). Yes, the bitmap should
> > definitely go.
>
> A naive implementation is obviously less-than-efficient. However, what
> other problems exist with the libata QC tag allocator? I highly doubt
> SATA will change to beyond 32 queue tags, primarily because it would
> be a pain to change SDB FIS (it's likely to break the dozens of AHCI
> controller implementations out there). Further, it seems like the
> industry stopped caring about SATA and is pushing NVMe for future
> offerings.
>
> In any event, most modern systems should have instructions to count
> leading zeroes and modify bits atomically.
It's inefficient not because scanning is expensive but because it
makes all CPUs in the system to hit on the exact same cacheline over
and over and over again.
Thanks.
--
tejun
On Thu, Jul 25, 2013 at 12:16:41PM +0200, Alexander Gordeev wrote:
> On Mon, Jul 22, 2013 at 02:10:36PM -0700, Nicholas A. Bellinger wrote:
> > Np. FYI, you'll want to use the latest commit e7827b351 HEAD from
> > target-pending/scsi-mq, which now has functioning scsi-generic support.
>
> Survives a boot, a kernel build and the build's result :)
Not that rosy. Turned out the old code is called. Hangs with this hunk..
diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index ac05cd6..a75fd41 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1103,6 +1103,8 @@ static struct device_attribute *piix_sidpr_shost_attrs[] = {
static struct scsi_host_template piix_sidpr_sht = {
ATA_BMDMA_SHT(DRV_NAME),
.shost_attrs = piix_sidpr_shost_attrs,
+ .scsi_mq = true,
+ .queuecommand_mq = ata_scsi_queuecmd,
};
static struct ata_port_operations piix_sidpr_sata_ops = {
--
Regards,
Alexander Gordeev
[email protected]
On Mon, Jul 29, 2013 at 07:46:53AM -0400, Tejun Heo wrote:
> One thing which would probably be worthwhile tho is getting rid of the
> bitmap based qc tag allocator in libata. That one is just borderline
> stupid to keep around on any setup which is supposed to be scalable.
Hi Tejun,
How about this approach?
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index f218427..5c2a236 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -72,6 +72,8 @@
#include "libata.h"
#include "libata-transport.h"
+#include "../../block/blk-mq-tag.h"
+
/* debounce timing parameters in msecs { interval, duration, timeout } */
const unsigned long sata_deb_timing_normal[] = { 5, 100, 2000 };
const unsigned long sata_deb_timing_hotplug[] = { 25, 500, 2000 };
@@ -1569,18 +1571,8 @@ unsigned ata_exec_internal_sg(struct ata_device *dev,
/* initialize internal qc */
- /* XXX: Tag 0 is used for drivers with legacy EH as some
- * drivers choke if any other tag is given. This breaks
- * ata_tag_internal() test for those drivers. Don't use new
- * EH stuff without converting to it.
- */
- if (ap->ops->error_handler)
- tag = ATA_TAG_INTERNAL;
- else
- tag = 0;
-
- if (test_and_set_bit(tag, &ap->qc_allocated))
- BUG();
+ tag = blk_mq_get_tag(ap->qc_tags, GFP_ATOMIC, true);
+ BUG_ON(!ata_tag_internal(tag));
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
@@ -4733,21 +4725,17 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
{
struct ata_queued_cmd *qc = NULL;
- unsigned int i;
+ unsigned int tag;
/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
return NULL;
- /* the last tag is reserved for internal command. */
- for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
- if (!test_and_set_bit(i, &ap->qc_allocated)) {
- qc = __ata_qc_from_tag(ap, i);
- break;
- }
+ tag = blk_mq_get_tag(ap->qc_tags, GFP_ATOMIC, false);
+ qc = __ata_qc_from_tag(ap, tag);
if (qc)
- qc->tag = i;
+ qc->tag = tag;
return qc;
}
@@ -4799,7 +4787,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
tag = qc->tag;
if (likely(ata_tag_valid(tag))) {
qc->tag = ATA_TAG_POISON;
- clear_bit(tag, &ap->qc_allocated);
+ blk_mq_put_tag(ap->qc_tags, tag);
}
}
@@ -5639,6 +5627,12 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
if (!ap)
return NULL;
+ ap->qc_tags = blk_mq_init_tags(ATA_MAX_QUEUE, 1, NUMA_NO_NODE);
+ if (!ap->qc_tags) {
+ kfree(ap);
+ return NULL;
+ }
+
ap->pflags |= ATA_PFLAG_INITIALIZING | ATA_PFLAG_FROZEN;
ap->lock = &host->lock;
ap->print_id = -1;
@@ -5677,6 +5671,14 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
return ap;
}
+static void ata_port_free(struct ata_port *ap)
+{
+ kfree(ap->pmp_link);
+ kfree(ap->slave_link);
+ blk_mq_free_tags(ap->qc_tags);
+ kfree(ap);
+}
+
static void ata_host_release(struct device *gendev, void *res)
{
struct ata_host *host = dev_get_drvdata(gendev);
@@ -5691,9 +5693,7 @@ static void ata_host_release(struct device *gendev, void *res)
if (ap->scsi_host)
scsi_host_put(ap->scsi_host);
- kfree(ap->pmp_link);
- kfree(ap->slave_link);
- kfree(ap);
+ ata_port_free(ap);
host->ports[i] = NULL;
}
diff --git a/include/linux/libata.h b/include/linux/libata.h
index eae7a05..4ff9494 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -126,7 +126,7 @@ enum {
ATA_DEF_QUEUE = 1,
/* tag ATA_MAX_QUEUE - 1 is reserved for internal commands */
ATA_MAX_QUEUE = 32,
- ATA_TAG_INTERNAL = ATA_MAX_QUEUE - 1,
+ ATA_TAG_INTERNAL = 0,
ATA_SHORT_PAUSE = 16,
ATAPI_MAX_DRAIN = 16 << 10,
@@ -766,7 +766,7 @@ struct ata_port {
unsigned int cbl; /* cable type; ATA_CBL_xxx */
struct ata_queued_cmd qcmd[ATA_MAX_QUEUE];
- unsigned long qc_allocated;
+ struct blk_mq_tags *qc_tags;
unsigned int qc_active;
int nr_active_links; /* #links with active qcs */
> Thanks.
>
> --
> tejun
--
Regards,
Alexander Gordeev
[email protected]
On Fri, Aug 09, 2013 at 10:23:35AM +0200, Alexander Gordeev wrote:
> On Mon, Jul 29, 2013 at 07:46:53AM -0400, Tejun Heo wrote:
> > One thing which would probably be worthwhile tho is getting rid of the
> > bitmap based qc tag allocator in libata. That one is just borderline
> > stupid to keep around on any setup which is supposed to be scalable.
>
> Hi Tejun,
>
> How about this approach?
Haven't looked at it thoroughly and I still don't know anything about
blk-mq but it looks good on the first glance.
Thanks.
--
tejun
On 08/09/2013 02:23 AM, Alexander Gordeev wrote:
> On Mon, Jul 29, 2013 at 07:46:53AM -0400, Tejun Heo wrote:
>> One thing which would probably be worthwhile tho is getting rid of the
>> bitmap based qc tag allocator in libata. That one is just borderline
>> stupid to keep around on any setup which is supposed to be scalable.
>
> Hi Tejun,
>
> How about this approach?
>
> @@ -5639,6 +5627,12 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
> if (!ap)
> return NULL;
>
> + ap->qc_tags = blk_mq_init_tags(ATA_MAX_QUEUE, 1, NUMA_NO_NODE);
> + if (!ap->qc_tags) {
> + kfree(ap);
> + return NULL;
> + }
This should be blk_mq_init_tags(ATA_MAX_QUEUE - 1, 1, ...) since the
total depth is normal_tags + reserved_tags. Apart from that, I think it
looks alright based on a cursory look.
--
Jens Axboe
On Fri, Aug 09, 2013 at 08:24:38AM -0600, Jens Axboe wrote:
> On 08/09/2013 02:23 AM, Alexander Gordeev wrote:
> > + ap->qc_tags = blk_mq_init_tags(ATA_MAX_QUEUE, 1, NUMA_NO_NODE);
> > + if (!ap->qc_tags) {
> > + kfree(ap);
> > + return NULL;
> > + }
>
> This should be blk_mq_init_tags(ATA_MAX_QUEUE - 1, 1, ...) since the
> total depth is normal_tags + reserved_tags.
Aha.. If blk_mq_init_tags() should be like this then?
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index dcbc2a4..b131a48 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -468,10 +468,9 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
* Rest of the tags start at the queue list
*/
tags->nr_free = 0;
- while (nr_tags - tags->nr_reserved) {
+ while (nr_tags--) {
tags->freelist[tags->nr_free] = tags->nr_free +
tags->nr_reserved;
- nr_tags--;
tags->nr_free++;
}
> --
> Jens Axboe
--
Regards,
Alexander Gordeev
[email protected]
On 08/09/2013 09:07 AM, Alexander Gordeev wrote:
> On Fri, Aug 09, 2013 at 08:24:38AM -0600, Jens Axboe wrote:
>> On 08/09/2013 02:23 AM, Alexander Gordeev wrote:
>>> + ap->qc_tags = blk_mq_init_tags(ATA_MAX_QUEUE, 1, NUMA_NO_NODE);
>>> + if (!ap->qc_tags) {
>>> + kfree(ap);
>>> + return NULL;
>>> + }
>>
>> This should be blk_mq_init_tags(ATA_MAX_QUEUE - 1, 1, ...) since the
>> total depth is normal_tags + reserved_tags.
>
> Aha.. If blk_mq_init_tags() should be like this then?
>
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index dcbc2a4..b131a48 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -468,10 +468,9 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
> * Rest of the tags start at the queue list
> */
> tags->nr_free = 0;
> - while (nr_tags - tags->nr_reserved) {
> + while (nr_tags--) {
> tags->freelist[tags->nr_free] = tags->nr_free +
> tags->nr_reserved;
> - nr_tags--;
> tags->nr_free++;
> }
I misremembered, just checked the code. I think I used to have it like I
described, but changed it since I thought it would be more logical to
pass in full depth, and then what part of that is reserved. Looking at
the current code, your patch looks correct as-is.
--
Jens Axboe
On Fri, Aug 09, 2013 at 09:52:19AM -0600, Jens Axboe wrote:
> On 08/09/2013 09:07 AM, Alexander Gordeev wrote:
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index dcbc2a4..b131a48 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -468,10 +468,9 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
> > * Rest of the tags start at the queue list
> > */
> > tags->nr_free = 0;
> > - while (nr_tags - tags->nr_reserved) {
> > + while (nr_tags--) {
> > tags->freelist[tags->nr_free] = tags->nr_free +
> > tags->nr_reserved;
> > - nr_tags--;
> > tags->nr_free++;
> > }
>
> I misremembered, just checked the code. I think I used to have it like I
> described, but changed it since I thought it would be more logical to
> pass in full depth, and then what part of that is reserved. Looking at
> the current code, your patch looks correct as-is.
Ok, then a whole series "[PATCH 0/3] blk-mq: Avoid effects of a weird queue
depth" (I posted earlier in a separate thread) should make sense. Besides
the hunk above it limits the per-cpu cache size and sanity-checks total vs
reserved length. I can resubmit if you want.
>
> --
> Jens Axboe
>
--
Regards,
Alexander Gordeev
[email protected]
On 08/09/2013 10:46 AM, Alexander Gordeev wrote:
> On Fri, Aug 09, 2013 at 09:52:19AM -0600, Jens Axboe wrote:
>> On 08/09/2013 09:07 AM, Alexander Gordeev wrote:
>>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>>> index dcbc2a4..b131a48 100644
>>> --- a/block/blk-mq-tag.c
>>> +++ b/block/blk-mq-tag.c
>>> @@ -468,10 +468,9 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
>>> * Rest of the tags start at the queue list
>>> */
>>> tags->nr_free = 0;
>>> - while (nr_tags - tags->nr_reserved) {
>>> + while (nr_tags--) {
>>> tags->freelist[tags->nr_free] = tags->nr_free +
>>> tags->nr_reserved;
>>> - nr_tags--;
>>> tags->nr_free++;
>>> }
>>
>> I misremembered, just checked the code. I think I used to have it like I
>> described, but changed it since I thought it would be more logical to
>> pass in full depth, and then what part of that is reserved. Looking at
>> the current code, your patch looks correct as-is.
>
> Ok, then a whole series "[PATCH 0/3] blk-mq: Avoid effects of a weird queue
> depth" (I posted earlier in a separate thread) should make sense. Besides
> the hunk above it limits the per-cpu cache size and sanity-checks total vs
> reserved length. I can resubmit if you want.
You don't have to resubmit, I'll get it reviewed and applied today.
--
Jens Axboe
On Sat, Jul 20, 2013 at 09:48:28AM -0500, Mike Christie wrote:
> What about the attached only compile tested patch. The patch has the mq
> block code work like the non mq code for bio cleanups.
Not sure if it is related to the patch or not, but it never returns from
wait_for_completion_io(&wait) in blkdev_issue_flush():
# ps axl | awk '$10 ~ /D\+/'
4 0 938 879 20 0 111216 656 blkdev D+ pts/1 0:00 fdisk/dev/sda
#
# cat /proc/938/stack
[<ffffffff812a8a5c>] blkdev_issue_flush+0xfc/0x160
[<ffffffff811ac606>] blkdev_fsync+0x96/0xc0
[<ffffffff811a2f4d>] do_fsync+0x5d/0x90
[<ffffffff811a3330>] SyS_fsync+0x10/0x20
[<ffffffff81611582>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
Any ideas?
Thanks!
--
Regards,
Alexander Gordeev
[email protected]
On Fri, 2013-08-09 at 21:15 +0200, Alexander Gordeev wrote:
> On Sat, Jul 20, 2013 at 09:48:28AM -0500, Mike Christie wrote:
> > What about the attached only compile tested patch. The patch has the mq
> > block code work like the non mq code for bio cleanups.
>
> Not sure if it is related to the patch or not, but it never returns from
> wait_for_completion_io(&wait) in blkdev_issue_flush():
>
> # ps axl | awk '$10 ~ /D\+/'
> 4 0 938 879 20 0 111216 656 blkdev D+ pts/1 0:00 fdisk/dev/sda
> #
> # cat /proc/938/stack
> [<ffffffff812a8a5c>] blkdev_issue_flush+0xfc/0x160
> [<ffffffff811ac606>] blkdev_fsync+0x96/0xc0
> [<ffffffff811a2f4d>] do_fsync+0x5d/0x90
> [<ffffffff811a3330>] SyS_fsync+0x10/0x20
> [<ffffffff81611582>] system_call_fastpath+0x16/0x1b
> [<ffffffffffffffff>] 0xffffffffffffffff
>
> Any ideas?
>
Mmmm, I'm able to reproduce over here with ahci + scsi-mq, and it
appears to be a bug related with using sdev->sdev_md_req.queue_depth=1,
that ends up causing the blkdev_issue_flush() to wait forever because
blk_mq_wait_for_tags() never ends up getting the single tag back for the
WRITE_FLUSH bio -> SYNCHRONIZE_CACHE cdb.
Here's the echo w > /proc/sysrq-trigger output:
[ 282.620140] SysRq : Show Blocked State
[ 282.620958] task PC stack pid father
[ 282.622228] kworker/2:1H D 0000000000000002 0 532 2 0x00000000
[ 282.623607] Workqueue: kblockd mq_flush_work
[ 282.624027] ffff880037869c98 0000000000000046 ffff880037868010 0000000000011380
[ 282.624027] ffff88007d255910 0000000000011380 ffff880037869fd8 0000000000011380
[ 282.624027] ffff880037869fd8 0000000000011380 ffff88007d06f0d0 ffff88007d255910
[ 282.624027] Call Trace:
[ 282.624027] [<ffffffff8125b4fd>] ? do_rw_taskfile+0x2ab/0x2bf
[ 282.624027] [<ffffffff810235c4>] ? kvm_clock_read+0x1f/0x21
[ 282.624027] [<ffffffff81054fbc>] ? update_curr+0x4f/0xcd
[ 282.624027] [<ffffffff810235c4>] ? kvm_clock_read+0x1f/0x21
[ 282.624027] [<ffffffff810235cf>] ? kvm_clock_get_cycles+0x9/0xb
[ 282.624027] [<ffffffff81383946>] schedule+0x5f/0x61
[ 282.624027] [<ffffffff813839cf>] io_schedule+0x87/0xca
[ 282.624027] [<ffffffff81192402>] wait_on_tags+0x10f/0x146
[ 282.624027] [<ffffffff81192462>] blk_mq_wait_for_tags+0x29/0x3b
[ 282.624027] [<ffffffff8119132d>] blk_mq_alloc_request_pinned+0xcf/0xe5
[ 282.624027] [<ffffffff811913a6>] blk_mq_alloc_request+0x2d/0x34
[ 282.624027] [<ffffffff8118c60f>] mq_flush_work+0x1a/0x3d
[ 282.624027] [<ffffffff8104474b>] process_one_work+0x257/0x368
[ 282.624027] [<ffffffff81044a4a>] worker_thread+0x1ee/0x34b
[ 282.624027] [<ffffffff8104485c>] ? process_one_work+0x368/0x368
[ 282.624027] [<ffffffff81049771>] kthread+0xb0/0xb8
[ 282.624027] [<ffffffff810496c1>] ? kthread_freezable_should_stop+0x60/0x60
[ 282.624027] [<ffffffff8138a07c>] ret_from_fork+0x7c/0xb0
[ 282.624027] [<ffffffff810496c1>] ? kthread_freezable_should_stop+0x60/0x60
[ 282.624027] fdisk D 0000000000000002 0 1947 1930 0x00000000
[ 282.624027] ffff880037bd9d48 0000000000000082 ffff880037bd8010 0000000000011380
[ 282.624027] ffff88007ca223a0 0000000000011380 ffff880037bd9fd8 0000000000011380
[ 282.624027] ffff880037bd9fd8 0000000000011380 ffff88007d06bb60 ffff88007ca223a0
[ 282.624027] Call Trace:
[ 282.624027] [<ffffffff813835e8>] ? __schedule+0x687/0x726
[ 282.624027] [<ffffffff81383946>] schedule+0x5f/0x61
[ 282.624027] [<ffffffff81381d18>] schedule_timeout+0x24/0x183
[ 282.624027] [<ffffffff810235c4>] ? kvm_clock_read+0x1f/0x21
[ 282.624027] [<ffffffff810235cf>] ? kvm_clock_get_cycles+0x9/0xb
[ 282.624027] [<ffffffff8105d2bd>] ? ktime_get_ts+0x53/0xc7
[ 282.624027] [<ffffffff81382e01>] io_schedule_timeout+0x93/0xe4
[ 282.624027] [<ffffffff81051fc7>] ? __cond_resched+0x25/0x31
[ 282.624027] [<ffffffff81383c46>] T.1554+0x8e/0xfc
[ 282.624027] [<ffffffff81054159>] ? try_to_wake_up+0x222/0x222
[ 282.624027] [<ffffffff81383cc7>] wait_for_completion_io+0x13/0x15
[ 282.624027] [<ffffffff8118cbde>] blkdev_issue_flush+0xfb/0x145
[ 282.624027] [<ffffffff810f067a>] blkdev_fsync+0x30/0x3d
[ 282.624027] [<ffffffff810e9259>] vfs_fsync_range+0x18/0x21
[ 282.624027] [<ffffffff810e9279>] vfs_fsync+0x17/0x19
[ 282.624027] [<ffffffff810e942e>] do_fsync+0x35/0x53
[ 282.624027] [<ffffffff810d5574>] ? SyS_ioctl+0x47/0x69
[ 282.624027] [<ffffffff810e9469>] SyS_fsync+0xb/0xf
[ 282.624027] [<ffffffff8138a129>] system_call_fastpath+0x16/0x1b
[ 282.624027] blkid D 0000000000000001 0 1952 1 0x00000000
[ 282.679428] ffff8800371638a8 0000000000000082 ffff880037162010 0000000000011380
[ 282.679428] ffff88007ca205f0 0000000000011380 ffff880037163fd8 0000000000011380
[ 282.679428] ffff880037163fd8 0000000000011380 ffff88007d06b570 ffff88007ca205f0
[ 282.679428] Call Trace:
[ 282.679428] [<ffffffff810677a9>] ? generic_exec_single+0x75/0x93
[ 282.679428] [<ffffffff8119212a>] ? blk_mq_tag_busy_iter+0x116/0x116
[ 282.679428] [<ffffffff8106797f>] ? smp_call_function_single+0xf9/0x111
[ 282.679428] [<ffffffff81383946>] schedule+0x5f/0x61
[ 282.679428] [<ffffffff813839cf>] io_schedule+0x87/0xca
[ 282.679428] [<ffffffff81192402>] wait_on_tags+0x10f/0x146
[ 282.679428] [<ffffffff81192462>] blk_mq_wait_for_tags+0x29/0x3b
[ 282.679428] [<ffffffff8119132d>] blk_mq_alloc_request_pinned+0xcf/0xe5
[ 282.679428] [<ffffffff811916b9>] blk_mq_make_request+0x14d/0x2dc
[ 282.679428] [<ffffffff810978c4>] ? mempool_alloc_slab+0x10/0x12
[ 282.679428] [<ffffffff8118951e>] generic_make_request+0x9c/0xdf
[ 282.679428] [<ffffffff81189648>] submit_bio+0xe7/0xf2
[ 282.679428] [<ffffffff810eaaeb>] _submit_bh+0x1b0/0x1d3
[ 282.679428] [<ffffffff810eab19>] submit_bh+0xb/0xd
[ 282.679428] [<ffffffff810ed6e5>] block_read_full_page+0x24d/0x26d
[ 282.679428] [<ffffffff810ef905>] ? I_BDEV+0xd/0xd
[ 282.679428] [<ffffffff810a7624>] ? __inc_zone_page_state+0x1e/0x20
[ 282.679428] [<ffffffff81096188>] ? add_to_page_cache_locked+0x78/0xb0
[ 282.679428] [<ffffffff810f04a5>] blkdev_readpage+0x13/0x15
[ 282.679428] [<ffffffff8109de8d>] __do_page_cache_readahead+0x194/0x1d0
[ 282.679428] [<ffffffff81381f41>] ? __wait_on_bit_lock+0x79/0x8a
[ 282.679428] [<ffffffff8109df50>] force_page_cache_readahead+0x67/0x8d
[ 282.679428] [<ffffffff8109e29a>] page_cache_sync_readahead+0x26/0x3a
[ 282.679428] [<ffffffff81097510>] generic_file_aio_read+0x265/0x5cd
[ 282.679428] [<ffffffff810efaae>] blkdev_aio_read+0x57/0x5e
[ 282.679428] [<ffffffff810c6b8d>] do_sync_read+0x79/0x9f
[ 282.679428] [<ffffffff810c7db7>] vfs_read+0xab/0x130
[ 282.679428] [<ffffffff810c7f06>] SyS_read+0x4f/0x79
[ 282.679428] [<ffffffff8138a129>] system_call_fastpath+0x16/0x1b
[ 282.679428] blkid D 0000000000000003 0 1992 927 0x00000000
[ 282.679428] ffff88007848f8a8 0000000000000086 ffff88007848e010 0000000000011380
[ 282.679428] ffff88007d250000 0000000000011380 ffff88007848ffd8 0000000000011380
[ 282.679428] ffff88007848ffd8 0000000000011380 ffff88007d06c150 ffff88007d250000
[ 282.679428] Call Trace:
[ 282.679428] [<ffffffff8109b9b6>] ? __alloc_pages_nodemask+0xf7/0x5eb
[ 282.679428] [<ffffffff81383946>] schedule+0x5f/0x61
[ 282.679428] [<ffffffff813839cf>] io_schedule+0x87/0xca
[ 282.679428] [<ffffffff81192402>] wait_on_tags+0x10f/0x146
[ 282.679428] [<ffffffff81192462>] blk_mq_wait_for_tags+0x29/0x3b
[ 282.679428] [<ffffffff8119132d>] blk_mq_alloc_request_pinned+0xcf/0xe5
[ 282.679428] [<ffffffff811916b9>] blk_mq_make_request+0x14d/0x2dc
[ 282.679428] [<ffffffff8118d81f>] ? create_task_io_context+0xa6/0xf5
[ 282.679428] [<ffffffff8118951e>] generic_make_request+0x9c/0xdf
[ 282.679428] [<ffffffff81189648>] submit_bio+0xe7/0xf2
[ 282.679428] [<ffffffff810eaaeb>] _submit_bh+0x1b0/0x1d3
[ 282.679428] [<ffffffff810eab19>] submit_bh+0xb/0xd
[ 282.679428] [<ffffffff810ed6e5>] block_read_full_page+0x24d/0x26d
[ 282.679428] [<ffffffff810ef905>] ? I_BDEV+0xd/0xd
[ 282.679428] [<ffffffff810f04a5>] blkdev_readpage+0x13/0x15
[ 282.679428] [<ffffffff8109de8d>] __do_page_cache_readahead+0x194/0x1d0
[ 282.679428] [<ffffffff8109df50>] force_page_cache_readahead+0x67/0x8d
[ 282.679428] [<ffffffff8109e29a>] page_cache_sync_readahead+0x26/0x3a
[ 282.679428] [<ffffffff81097510>] generic_file_aio_read+0x265/0x5cd
[ 282.679428] [<ffffffff810efaae>] blkdev_aio_read+0x57/0x5e
[ 282.679428] [<ffffffff810c6b8d>] do_sync_read+0x79/0x9f
[ 282.679428] [<ffffffff810c7db7>] vfs_read+0xab/0x130
[ 282.679428] [<ffffffff810c7f06>] SyS_read+0x4f/0x79
[ 282.679428] [<ffffffff8138a129>] system_call_fastpath+0x16/0x1b
Bumping queue_depth=2 seems to work-around the issue, but AFAICT it's a
genuine tag starvation bug with queue_depth=1 and WRITE_FLUSH..
--nab
On Fri, Aug 09, 2013 at 11:07:37AM -0600, Jens Axboe wrote:
> You don't have to resubmit, I'll get it reviewed and applied today.
Hi Jens,
I limited the minimal queue depth to 4, which is apparently wrong
in case of libata. I will post a new series.
> --
> Jens Axboe
>
--
Regards,
Alexander Gordeev
[email protected]
On Fri, Aug 09, 2013 at 01:17:37PM -0700, Nicholas A. Bellinger wrote:
> On Fri, 2013-08-09 at 21:15 +0200, Alexander Gordeev wrote:
> Mmmm, I'm able to reproduce over here with ahci + scsi-mq, and it
> appears to be a bug related with using sdev->sdev_md_req.queue_depth=1,
> that ends up causing the blkdev_issue_flush() to wait forever because
> blk_mq_wait_for_tags() never ends up getting the single tag back for the
> WRITE_FLUSH bio -> SYNCHRONIZE_CACHE cdb.
It turns out this way - blkdev_issue_flush() claims the only tag, submits
the bio and waits for the completion. But because blk_mq_make_request()
does not mark any context in blk_mq_hw_ctx::ctx_map (nor enslists the request
into blk_mq_ctx::rq_list) it never gets processed from blk_mq_work_fn->
__blk_mq_run_hw_queue() and blkdev_issue_flush() waits endlessly. All other
requests are just waiting for the tag availability as result.
[...]
> Bumping queue_depth=2 seems to work-around the issue, but AFAICT it's a
> genuine tag starvation bug with queue_depth=1 and WRITE_FLUSH..
If I try to hack and force __blk_mq_run_hw_queue() to process the request...
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6fc1df3..c22b6f66 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -889,9 +962,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
hctx->queued++;
if (unlikely(is_flush_fua)) {
+ list_add(&rq->queuelist, &hctx->dispatch);
blk_mq_bio_to_request(q, rq, bio);
blk_mq_put_ctx(ctx);
blk_insert_flush(rq);
goto run_queue;
}
... I get a kernel BUG at drivers/scsi/scsi_lib.c:1233
BUG_ON(!req->nr_phys_segments);
IOW I am not sure how to proceed.
> --nab
>
--
Regards,
Alexander Gordeev
[email protected]
On Thu, 2013-08-15 at 18:23 +0200, Alexander Gordeev wrote:
> On Fri, Aug 09, 2013 at 01:17:37PM -0700, Nicholas A. Bellinger wrote:
> > On Fri, 2013-08-09 at 21:15 +0200, Alexander Gordeev wrote:
> > Mmmm, I'm able to reproduce over here with ahci + scsi-mq, and it
> > appears to be a bug related with using sdev->sdev_md_req.queue_depth=1,
> > that ends up causing the blkdev_issue_flush() to wait forever because
> > blk_mq_wait_for_tags() never ends up getting the single tag back for the
> > WRITE_FLUSH bio -> SYNCHRONIZE_CACHE cdb.
>
> It turns out this way - blkdev_issue_flush() claims the only tag, submits
> the bio and waits for the completion. But because blk_mq_make_request()
> does not mark any context in blk_mq_hw_ctx::ctx_map (nor enslists the request
> into blk_mq_ctx::rq_list) it never gets processed from blk_mq_work_fn->
> __blk_mq_run_hw_queue() and blkdev_issue_flush() waits endlessly. All other
> requests are just waiting for the tag availability as result.
>
Ok, here's a bit better idea of what is going on now..
The problem is that blkdev_issue_flush() -> blk_mq_make_request() ->
__blk_mq_alloc_request() allocates the first tag, which calls
blk_insert_flush() -> blk_flush_complete_seq() -> blk_flush_kick() ->
mq_flush_work() -> blk_mq_alloc_request() to allocate a second tag for
the struct request that actually gets dispatched into scsi-mq as a
SYCHRONIZE_CACHE command..
I'm not exactly sure why this double tag usage of struct request is
occurring, but AFAICT it does happen for every flush, and is not
specific to the blkdev_issue_flush() codepath.. I'm sure that Jens can
fill us in on that bit. ;)
So, assuming that this double tag usage is necessary and not a bug,
perhaps using a reserved tag for the first tag (eg: the one that's not
dispatched into scsi_mq_queue_rq) makes sense..?
I'm playing with a patch to do this, but am currently getting hung-up on
what appear to be some separate blk-mq reserved_tags > 0 bugs, the first
of which is passing queue_depth=1 + reserved_tags=1 is broken, and
results in tags->nr_free = 0.
Here's the quick fix:
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 6718007..ffdf686 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -470,7 +470,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
* Rest of the tags start at the queue list
*/
tags->nr_free = 0;
- while (nr_tags - tags->nr_reserved) {
+ while (nr_tags) {
tags->freelist[tags->nr_free] = tags->nr_free +
tags->nr_reserved;
nr_tags--;
Anyways, before digging further into reserved tags logic, Jens, what are
your thoughts for addressing this special queue_depth=1 case for libata
+ the like..?
--nab
On Thu, Aug 15, 2013 at 07:19:29PM -0700, Nicholas A. Bellinger wrote:
> I'm playing with a patch to do this, but am currently getting hung-up on
> what appear to be some separate blk-mq reserved_tags > 0 bugs, the first
> of which is passing queue_depth=1 + reserved_tags=1 is broken, and
> results in tags->nr_free = 0.
That is not a bug - please look at Jens replies in this thread some week ago.
In short, queue_depth=1 means 1 tags in total and reserved_tags=1 results
in zero normal tags. You need to request the depth=2 and reserved_tags=1.
But yes, this is a separate topic and I am looking forward to hear from Jens
wrt flushes.
--
Regards,
Alexander Gordeev
[email protected]
On Fri, 2013-08-16 at 18:41 +0200, Alexander Gordeev wrote:
> On Thu, Aug 15, 2013 at 07:19:29PM -0700, Nicholas A. Bellinger wrote:
> > I'm playing with a patch to do this, but am currently getting hung-up on
> > what appear to be some separate blk-mq reserved_tags > 0 bugs, the first
> > of which is passing queue_depth=1 + reserved_tags=1 is broken, and
> > results in tags->nr_free = 0.
>
> That is not a bug - please look at Jens replies in this thread some week ago.
> In short, queue_depth=1 means 1 tags in total and reserved_tags=1 results
> in zero normal tags. You need to request the depth=2 and reserved_tags=1.
>
Ahhh, yes of course. I'll re-work a proposed patch this afternoon with
this in mind..
> But yes, this is a separate topic and I am looking forward to hear from Jens
> wrt flushes.
>
Indeed. ;)
--nab
On Thu, Aug 15, 2013 at 07:19:29PM -0700, Nicholas A. Bellinger wrote:
> Anyways, before digging further into reserved tags logic, Jens, what are
> your thoughts for addressing this special queue_depth=1 case for libata
> + the like..?
Hi Jens,
Have some comments?
--
Regards,
Alexander Gordeev
[email protected]
On Thu, Aug 15, 2013 at 07:19:29PM -0700, Nicholas A. Bellinger wrote:
> Ok, here's a bit better idea of what is going on now..
>
> The problem is that blkdev_issue_flush() -> blk_mq_make_request() ->
> __blk_mq_alloc_request() allocates the first tag, which calls
> blk_insert_flush() -> blk_flush_complete_seq() -> blk_flush_kick() ->
> mq_flush_work() -> blk_mq_alloc_request() to allocate a second tag for
> the struct request that actually gets dispatched into scsi-mq as a
> SYCHRONIZE_CACHE command..
>
> I'm not exactly sure why this double tag usage of struct request is
> occurring, but AFAICT it does happen for every flush, and is not
> specific to the blkdev_issue_flush() codepath.. I'm sure that Jens can
> fill us in on that bit. ;)
I also played with the double tag using a reserved tag (below).
While it fixes 'fdisk /dev/sda' issue when trying to 'mount /dev/sda1 /mnt'
what appears to be a call to bio->bi_end_io() from the free'd bio hits in.
Not sure if I should pursue the root cause until the whole double-tag
thingy is confirmed.
Jens?
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6fc1df3..81794dc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -874,14 +874,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
hctx = q->mq_ops->map_queue(q, ctx->cpu);
trace_block_getrq(q, bio, rw);
- rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
+ rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, is_flush_fua);
if (likely(rq))
blk_mq_rq_ctx_init(ctx, rq, rw);
else {
blk_mq_put_ctx(ctx);
trace_block_sleeprq(q, bio, rw);
rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
- false);
+ is_flush_fua);
ctx = rq->mq_ctx;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
}
@@ -1317,6 +1317,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
reg->queue_depth = BLK_MQ_MAX_DEPTH;
}
+ reg->queue_depth++;
+ reg->reserved_tags++;
+
ctx = alloc_percpu(struct blk_mq_ctx);
if (!ctx)
return ERR_PTR(-ENOMEM);
--
Regards,
Alexander Gordeev
[email protected]
Hi Alexander!
Apologies for the long delay on this follow-up.. Comments below.
On Fri, 2013-09-20 at 17:19 +0200, Alexander Gordeev wrote:
> On Thu, Aug 15, 2013 at 07:19:29PM -0700, Nicholas A. Bellinger wrote:
> > Ok, here's a bit better idea of what is going on now..
> >
> > The problem is that blkdev_issue_flush() -> blk_mq_make_request() ->
> > __blk_mq_alloc_request() allocates the first tag, which calls
> > blk_insert_flush() -> blk_flush_complete_seq() -> blk_flush_kick() ->
> > mq_flush_work() -> blk_mq_alloc_request() to allocate a second tag for
> > the struct request that actually gets dispatched into scsi-mq as a
> > SYCHRONIZE_CACHE command..
> >
> > I'm not exactly sure why this double tag usage of struct request is
> > occurring, but AFAICT it does happen for every flush, and is not
> > specific to the blkdev_issue_flush() codepath.. I'm sure that Jens can
> > fill us in on that bit. ;)
>
> I also played with the double tag using a reserved tag (below).
>
> While it fixes 'fdisk /dev/sda' issue when trying to 'mount /dev/sda1 /mnt'
> what appears to be a call to bio->bi_end_io() from the free'd bio hits in.
>
> Not sure if I should pursue the root cause until the whole double-tag
> thingy is confirmed.
>
> Jens?
>
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 6fc1df3..81794dc 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -874,14 +874,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
> hctx = q->mq_ops->map_queue(q, ctx->cpu);
>
> trace_block_getrq(q, bio, rw);
> - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
> + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, is_flush_fua);
> if (likely(rq))
> blk_mq_rq_ctx_init(ctx, rq, rw);
> else {
> blk_mq_put_ctx(ctx);
> trace_block_sleeprq(q, bio, rw);
> rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
> - false);
> + is_flush_fua);
> ctx = rq->mq_ctx;
> hctx = q->mq_ops->map_queue(q, ctx->cpu);
> }
So this is what I ended up doing as well, and does address the specific
bug with queue_depth=1.
> @@ -1317,6 +1317,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
> reg->queue_depth = BLK_MQ_MAX_DEPTH;
> }
>
> + reg->queue_depth++;
> + reg->reserved_tags++;
> +
> ctx = alloc_percpu(struct blk_mq_ctx);
> if (!ctx)
> return ERR_PTR(-ENOMEM);
>
I was actually setting this within scsi_mq_alloc_queue(), but given that
the queue_depth=1 issue is independent of scsi-mq, this does make more
sense.
Also, these extra increments should probably happen only when the passed
queue_depth == 1 && reserved_tags == 0.
Other than that minor nit.
Reviewed-by: Nicholas Bellinger <[email protected]>
On Sat, Jul 20, 2013 at 09:48:28AM -0500, Mike Christie wrote:
> What about the attached only compile tested patch. The patch has the mq
> block code work like the non mq code for bio cleanups.
>
>
> blk-mq: blk-mq should free bios in pass through case
>
> For non mq calls, the block layer will free the bios when
> blk_finish_request is called.
e
> For mq calls, the blk mq code wants the caller to do this.
>
> This patch has the blk mq code work like the non mq code
> and has the block layer free the bios.
>
> Signed-off-by: Mike Christie <[email protected]>
This patch breaks booting for me in the current blk multiqueue tree,
with an apparent double free of a bio when using virtio-blk in writeback
mode (cache=writeback or cache=none in qemu):
[ 15.253608] ------------[ cut here ]------------
[ 15.256422] kernel BUG at /work/hch/linux/fs/bio.c:498!
[ 15.256879] invalid opcode: 0000 [#1] SMP
[ 15.256879] Modules linked in:
[ 15.256879] CPU: 3 PID: 353 Comm: kblockd Not tainted 3.11.0+ #25
[ 15.256879] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 15.256879] task: ffff88007d75e0c0 ti: ffff88007d676000 task.ti: ffff88007d676000
[ 15.256879] RIP: 0010:[<ffffffff811b470a>] [<ffffffff811b470a>] bio_put+0x8a/0x90
[ 15.256879] RSP: 0018:ffff88007fd83b50 EFLAGS: 00010046
[ 15.256879] RAX: 0000000000000000 RBX: ffff88007d713080 RCX: 0000000000000035
[ 15.256879] RDX: 0000000000000002 RSI: ffff88007ad50338 RDI: ffff88007d713080
[ 15.256879] RBP: ffff88007fd83b60 R08: 7010000000000000 R09: 007ad50338080000
[ 15.256879] R10: ff672b1b7d38ce02 R11: 000000000000028b R12: 0000000000000000
[ 15.256879] R13: 0000000000000000 R14: ffff88007b4c36c0 R15: ffff88007b40d608
[ 15.256879] FS: 0000000000000000(0000) GS:ffff88007fd80000(0000) knlGS:0000000000000000
[ 15.256879] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 15.256879] CR2: 0000000000000138 CR3: 0000000002124000 CR4: 00000000000006e0
[ 15.256879] Stack:
[ 15.256879] ffff88007d713080 0000000000000000 ffff88007fd83b80 ffffffff811ae8a3
[ 15.256879] ffff88007fd83bf0 0000000000001000 ffff88007fd83b90 ffffffff811b3268
[ 15.256879] ffff88007fd83bc0 ffffffff816ac847 ffff88007b4c36c0 ffff88007fd99d00
[ 15.256879] Call Trace:
[ 15.256879] <IRQ>
[ 15.256879] [<ffffffff811ae8a3>] end_bio_bh_io_sync+0x33/0x50
[ 15.256879] [<ffffffff811b3268>] bio_endio+0x18/0x30
[ 15.256879] [<ffffffff816ac847>] blk_mq_complete_request+0x47/0xd0
[ 15.256879] [<ffffffff816ac8e9>] __blk_mq_end_io+0x19/0x20
[ 15.256879] [<ffffffff816ac958>] blk_mq_end_io+0x68/0xd0
[ 15.256879] [<ffffffff816a6162>] blk_flush_complete_seq+0xe2/0x370
[ 15.256879] [<ffffffff816a653b>] flush_end_io+0x11b/0x200
[ 15.256879] [<ffffffff816ac875>] blk_mq_complete_request+0x75/0xd0
[ 15.256879] [<ffffffff816ac8e9>] __blk_mq_end_io+0x19/0x20
[ 15.256879] [<ffffffff816ac958>] blk_mq_end_io+0x68/0xd0
[ 15.256879] [<ffffffff81844c2f>] virtblk_done+0xef/0x260
[ 15.256879] [<ffffffff81753cc0>] vring_interrupt+0x30/0x60
[ 15.256879] [<ffffffff81103724>] handle_irq_event_percpu+0x54/0x1f0
[ 15.256879] [<ffffffff81103903>] handle_irq_event+0x43/0x70
[ 15.256879] [<ffffffff8110609f>] handle_edge_irq+0x6f/0x120
[ 15.256879] [<ffffffff810445b8>] handle_irq+0x58/0x140
[ 15.256879] [<ffffffff81094bbf>] ? irq_enter+0x4f/0x90
[ 15.256879] [<ffffffff810440b5>] do_IRQ+0x55/0xd0
[ 15.256879] [<ffffffff81bd3972>] common_interrupt+0x72/0x72
[ 15.256879] [<ffffffff810c5135>] ? sched_clock_local+0x25/0xa0
[ 15.256879] [<ffffffff81094960>] ? __do_softirq+0xb0/0x250
[ 15.256879] [<ffffffff81094959>] ? __do_softirq+0xa9/0x250
[ 15.256879] [<ffffffff81094cae>] irq_exit+0xae/0xd0
[ 15.256879] [<ffffffff8106dcd5>] smp_apic_timer_interrupt+0x45/0x60
[ 15.256879] [<ffffffff81bdc772>] apic_timer_interrupt+0x72/0x80
[ 15.256879] <EOI>
[ 15.256879] [<ffffffff81bd3a33>] ? retint_restore_args+0x13/0x13
[ 15.256879] [<ffffffff81bd3502>] ? _raw_spin_unlock_irq+0x32/0x40
[ 15.256879] [<ffffffff81bd34fb>] ? _raw_spin_unlock_irq+0x2b/0x40
[ 15.256879] [<ffffffff810ac0c4>] rescuer_thread+0xe4/0x2f0
[ 15.256879] [<ffffffff810abfe0>] ? process_scheduled_works+0x40/0x40
[ 15.256879] [<ffffffff810b3916>] kthread+0xd6/0xe0
[ 15.256879] [<ffffffff81bd34fb>] ? _raw_spin_unlock_irq+0x2b/0x40
[ 15.256879] [<ffffffff810b3840>] ? __init_kthread_worker+0x70/0x70
[ 15.256879] [<ffffffff81bdbabc>] ret_from_fork+0x7c/0xb0
[ 15.256879] [<ffffffff810b3840>] ? __init_kthread_worker+0x70/0x70
[ 15.256879] Code: ff 41 8b 44 24 08 48 89 df 49 8b 74 24 10 48 29 c7 e8 cb 88 f8 ff 48 8b 5d f0 4c 8b 65 f8 c9 c3 90 48 89 df e8 b8 5c fc ff eb 9b <0f> 0b 0f 1f 40 00 55 48 89 e5 41 57 45 31 ff 41 56 41 55 41 54
[ 15.256879] RIP [<ffffffff811b470a>] bio_put+0x8a/0x90
[ 15.256879] RSP <ffff88007fd83b50>
[ 15.256879] ---[ end trace 1f201608bfddfca7 ]---
[ 15.256879] Kernel panic - not syncing: Fatal exception in interrupt
[ 15.256879] Shutting down cpus with NMI
On Thu, Oct 03, 2013 at 04:06:51AM -0700, Christoph Hellwig wrote:
> On Sat, Jul 20, 2013 at 09:48:28AM -0500, Mike Christie wrote:
> > What about the attached only compile tested patch. The patch has the mq
> > block code work like the non mq code for bio cleanups.
> >
> >
>
> > blk-mq: blk-mq should free bios in pass through case
> >
> > For non mq calls, the block layer will free the bios when
> > blk_finish_request is called.
> e
> > For mq calls, the blk mq code wants the caller to do this.
> >
> > This patch has the blk mq code work like the non mq code
> > and has the block layer free the bios.
> >
> > Signed-off-by: Mike Christie <[email protected]>
>
> This patch breaks booting for me in the current blk multiqueue tree,
> with an apparent double free of a bio when using virtio-blk in writeback
> mode (cache=writeback or cache=none in qemu):
I am not sure if the root cause the same, but the panic I experience with
mounting a ahci device (and those double-tag usage described in another
thread) is somehow similar:
[ 181.184510] general protection fault: 0000 [#1] SMP
[ 181.184546] Modules linked in: lockd sunrpc snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_hwdep snd_pcm mperf snd_page_alloc i5000_edac coretemp snd_timer edac_core iTCO_wdt snd kvm_intel iTCO_vendor_support lpc_ich mfd_core igb dca i5k_amb ppdev soundcore hp_wmi tg3 kvm sparse_keymap serio_raw ptp microcode pcspkr rfkill pps_core shpchp parport_pc parport mptsas scsi_transport_sas mptscsih mptbase floppy nouveau video mxm_wmi wmi i2c_algo_bit drm_kms_helper ttm drm i2c_core
[ 181.184550] CPU: 6 PID: 0 Comm: swapper/6 Tainted: G W 3.10.0-rc5.debug+ #180
[ 181.184552] Hardware name: Hewlett-Packard HP xw6400 Workstation/0A04h, BIOS 786D4 v02.31 03/14/2008
[ 181.184554] task: ffff88007b1a8000 ti: ffff88007b19c000 task.ti: ffff88007b19c000
[ 181.184563] RIP: 0010:[<ffffffff811fa97b>] [<ffffffff811fa97b>] bio_endio+0x1b/0x40
[ 181.184565] RSP: 0018:ffff88007d203a28 EFLAGS: 00010002
[ 181.184567] RAX: 6b6b6b6b6b6b6b6b RBX: 0000000000000000 RCX: dead000000200200
[ 181.184568] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880068e29200
[ 181.184570] RBP: ffff88007d203a28 R08: ffffe8ffff201240 R09: 0000000000000000
[ 181.184571] R10: 0000000000000000 R11: 0000000000000001 R12: ffff880074d86000
[ 181.184572] R13: 6b6b6b6b6b6b6b6b R14: 000000006b6b6b6b R15: 0000000000000001
[ 181.184575] FS: 0000000000000000(0000) GS:ffff88007d200000(0000) knlGS:0000000000000000
[ 181.184576] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 181.184578] CR2: 00007f8afac8c45c CR3: 000000005f431000 CR4: 00000000000007e0
[ 181.184580] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 181.184581] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 181.184582] Stack:
[ 181.184587] ffff88007d203a78 ffffffff813202aa ffff88007d203a98 0000000000000046
[ 181.184591] 0000000000000046 ffff880072d08000 0000000000000000 ffff880074d860d8
[ 181.184594] 0000000000000000 0000000000000001 ffff88007d203a88 ffffffff81320445
[ 181.184595] Call Trace:
[ 181.184597] <IRQ>
[ 181.184603] [<ffffffff813202aa>] blk_mq_complete_request+0x5a/0x1d0
[ 181.184607] [<ffffffff81320445>] __blk_mq_end_io+0x25/0x30
[ 181.184609] [<ffffffff81320535>] blk_mq_end_io+0xe5/0xf0
[ 181.184613] [<ffffffff81319754>] blk_flush_complete_seq+0xf4/0x360
[ 181.184616] [<ffffffff81319a4b>] ? flush_end_io+0x4b/0x210
[ 181.184619] [<ffffffff81319b2a>] flush_end_io+0x12a/0x210
[ 181.184622] [<ffffffff813202da>] blk_mq_complete_request+0x8a/0x1d0
[ 181.184626] [<ffffffff8147b9fd>] ? scsi_device_unbusy+0x9d/0xd0
[ 181.184629] [<ffffffff81320445>] __blk_mq_end_io+0x25/0x30
[ 181.184632] [<ffffffff81320535>] blk_mq_end_io+0xe5/0xf0
[ 181.184635] [<ffffffff8147cae5>] scsi_mq_end_request+0x15/0x20
[ 181.184638] [<ffffffff8147bf20>] scsi_io_completion+0xa0/0x650
[ 181.184643] [<ffffffff810bbc3d>] ? trace_hardirqs_off+0xd/0x10
[ 181.184648] [<ffffffff814722f7>] scsi_finish_command+0x87/0xe0
[ 181.184650] [<ffffffff8147bccf>] scsi_softirq_done+0x13f/0x160
[ 181.184653] [<ffffffff8147cba5>] scsi_mq_done+0x15/0x20
[ 181.184658] [<ffffffff81495a73>] ata_scsi_qc_complete+0x63/0x470
[ 181.184661] [<ffffffff8148fad0>] __ata_qc_complete+0x90/0x140
[ 181.184664] [<ffffffff8148fc1d>] ata_qc_complete+0x9d/0x230
[ 181.184667] [<ffffffff8148fe51>] ata_qc_complete_multiple+0xa1/0xe0
[ 181.184673] [<ffffffff814aa449>] ahci_handle_port_interrupt+0x109/0x560
[ 181.184676] [<ffffffff814ab63f>] ahci_port_intr+0x2f/0x40
[ 181.184678] [<ffffffff814ab6f1>] ahci_interrupt+0xa1/0x100
[ 181.184683] [<ffffffff810ff7b5>] handle_irq_event_percpu+0x75/0x3d0
[ 181.184686] [<ffffffff810ffb58>] handle_irq_event+0x48/0x70
[ 181.184689] [<ffffffff81102d9e>] ? handle_fasteoi_irq+0x1e/0x100
[ 181.184692] [<ffffffff81102dda>] handle_fasteoi_irq+0x5a/0x100
[ 181.184696] [<ffffffff81004320>] handle_irq+0x60/0x150
[ 181.184702] [<ffffffff816ff846>] ? atomic_notifier_call_chain+0x16/0x20
[ 181.184706] [<ffffffff81705f7a>] do_IRQ+0x5a/0xe0
[ 181.184710] [<ffffffff816fb52f>] common_interrupt+0x6f/0x6f
[ 181.184712] <EOI>
[ 181.184716] [<ffffffff8100aa45>] ? default_idle+0x25/0x280
[ 181.184719] [<ffffffff8100aa43>] ? default_idle+0x23/0x280
[ 181.184722] [<ffffffff8100b4f6>] arch_cpu_idle+0x26/0x30
[ 181.184726] [<ffffffff810afe66>] cpu_startup_entry+0x96/0x3e0
[ 181.184729] [<ffffffff810b7ad5>] ? clockevents_register_device+0xb5/0x120
[ 181.184734] [<ffffffff816e67ea>] start_secondary+0x27a/0x27c
[ 181.184767] Code: 47 c1 c1 ea 03 83 c2 01 39 d0 0f 47 c2 c3 66 90 66 66 66 66 90 55 85 f6 48 89 e5 74 1b f0 80 67 18 fe 48 8b 47 40 48 85 c0 74 02 <ff> d0 5d 66 90 c3 0f 1f 80 00 00 00 00 48 8b 47 18 a8 01 b8 fb
[ 181.184770] RIP [<ffffffff811fa97b>] bio_endio+0x1b/0x40
[ 181.184772] RSP <ffff88007d203a28>
[ 181.184777] ---[ end trace 5e8fd083b8562c3c ]---
[ 181.184779] Kernel panic - not syncing: Fatal exception in interrupt
[ 181.185483] drm_kms_helper: panic occurred, switching back to text console
--
Regards,
Alexander Gordeev
[email protected]