Hi Tejun,
This update is against "scsi-mq" branch in Nicholas tree:
git://git.kernel.org/pub/scm/linux/kernel/git/nab/target-pending.git
A quick check with 'fio' tool's random reading indicates an
moderate improvement on a single drive, four threads and the
queue depth of 32:
read : io=131072KB, bw=149785B/s, iops=36, runt=896066msec
vs
read : io=131072KB, bw=166884B/s, iops=40, runt=804257msec
I have not measured cache line bouncing itself. I am kind of
limited in hardware configurations, but let me know if you
want to see more testing anyway.
Thanks!
Alexander Gordeev (1):
libata: Get rid of ata_port::qc_allocated bitmask
drivers/ata/libata-core.c | 82 +++++++++++++++++++++++++++++++-------------
include/linux/libata.h | 6 ++-
2 files changed, 62 insertions(+), 26 deletions(-)
--
1.7.7.6
Scanning ata_port::qc_allocated bitmask for free tags has
been a performance bottleneck due to the cache line bouncing.
This update replaces ata_port::qc_allocated bitmask with
optimized for parallel access percpu_ida tags allocator.
Signed-off-by: Alexander Gordeev <[email protected]>
---
drivers/ata/libata-core.c | 82 +++++++++++++++++++++++++++++++-------------
include/linux/libata.h | 6 ++-
2 files changed, 62 insertions(+), 26 deletions(-)
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 75b9367..88af198 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -68,6 +68,7 @@
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/platform_device.h>
+#include <linux/percpu_ida.h>
#include "libata.h"
#include "libata-transport.h"
@@ -1519,6 +1520,34 @@ static void ata_qc_complete_internal(struct ata_queued_cmd *qc)
complete(waiting);
}
+static unsigned int __ata_get_internal_tag(struct ata_port *ap)
+{
+ int tag = percpu_ida_alloc(&ap->qc_internal, GFP_NOWAIT);
+
+ if (unlikely(tag < 0))
+ return -1;
+
+ return ATA_TAG_INTERNAL;
+}
+
+static unsigned int __ata_get_tag(struct ata_port *ap)
+{
+ int tag = percpu_ida_alloc(&ap->qc_tags, GFP_NOWAIT);
+
+ if (unlikely(tag < 0))
+ return -1;
+
+ return (unsigned int)tag + (ATA_TAG_INTERNAL + 1);
+}
+
+static void __ata_put_tag(struct ata_port *ap, unsigned int tag)
+{
+ if (unlikely(tag == ATA_TAG_INTERNAL))
+ percpu_ida_free(&ap->qc_internal, 0);
+ else
+ percpu_ida_free(&ap->qc_tags, tag - (ATA_TAG_INTERNAL + 1));
+}
+
/**
* ata_exec_internal_sg - execute libata internal command
* @dev: Device to which the command is sent
@@ -1569,18 +1598,8 @@ unsigned ata_exec_internal_sg(struct ata_device *dev,
/* initialize internal qc */
- /* XXX: Tag 0 is used for drivers with legacy EH as some
- * drivers choke if any other tag is given. This breaks
- * ata_tag_internal() test for those drivers. Don't use new
- * EH stuff without converting to it.
- */
- if (ap->ops->error_handler)
- tag = ATA_TAG_INTERNAL;
- else
- tag = 0;
-
- if (test_and_set_bit(tag, &ap->qc_allocated))
- BUG();
+ tag = __ata_get_internal_tag(ap);
+ BUG_ON(!ata_tag_internal(tag));
qc = __ata_qc_from_tag(ap, tag);
qc->tag = tag;
@@ -4750,21 +4769,17 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
{
struct ata_queued_cmd *qc = NULL;
- unsigned int i;
+ unsigned int tag;
/* no command while frozen */
if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
return NULL;
- /* the last tag is reserved for internal command. */
- for (i = 0; i < ATA_MAX_QUEUE - 1; i++)
- if (!test_and_set_bit(i, &ap->qc_allocated)) {
- qc = __ata_qc_from_tag(ap, i);
- break;
- }
+ tag = __ata_get_tag(ap);
+ qc = __ata_qc_from_tag(ap, tag);
if (qc)
- qc->tag = i;
+ qc->tag = tag;
return qc;
}
@@ -4816,7 +4831,7 @@ void ata_qc_free(struct ata_queued_cmd *qc)
tag = qc->tag;
if (likely(ata_tag_valid(tag))) {
qc->tag = ATA_TAG_POISON;
- clear_bit(tag, &ap->qc_allocated);
+ __ata_put_tag(ap, tag);
}
}
@@ -5656,6 +5671,18 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
if (!ap)
return NULL;
+ if (__percpu_ida_init(&ap->qc_tags,
+ ATA_MAX_QUEUE - 1, 4, 2) < 0) {
+ kfree(ap);
+ return NULL;
+ }
+
+ if (__percpu_ida_init(&ap->qc_internal, 1, 1, 1) < 0) {
+ percpu_ida_destroy(&ap->qc_tags);
+ kfree(ap);
+ return NULL;
+ }
+
ap->pflags |= ATA_PFLAG_INITIALIZING | ATA_PFLAG_FROZEN;
ap->lock = &host->lock;
ap->print_id = -1;
@@ -5695,6 +5722,15 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
return ap;
}
+static void ata_port_free(struct ata_port *ap)
+{
+ kfree(ap->pmp_link);
+ kfree(ap->slave_link);
+ percpu_ida_destroy(&ap->qc_tags);
+ percpu_ida_destroy(&ap->qc_internal);
+ kfree(ap);
+}
+
static void ata_host_release(struct device *gendev, void *res)
{
struct ata_host *host = dev_get_drvdata(gendev);
@@ -5709,9 +5745,7 @@ static void ata_host_release(struct device *gendev, void *res)
if (ap->scsi_host)
scsi_host_put(ap->scsi_host);
- kfree(ap->pmp_link);
- kfree(ap->slave_link);
- kfree(ap);
+ ata_port_free(ap);
host->ports[i] = NULL;
}
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 0e23c26..d673e91 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -38,6 +38,7 @@
#include <linux/acpi.h>
#include <linux/cdrom.h>
#include <linux/sched.h>
+#include <linux/percpu_ida.h>
/*
* Define if arch has non-standard setup. This is a _PCI_ standard
@@ -126,7 +127,7 @@ enum {
ATA_DEF_QUEUE = 1,
/* tag ATA_MAX_QUEUE - 1 is reserved for internal commands */
ATA_MAX_QUEUE = 32,
- ATA_TAG_INTERNAL = ATA_MAX_QUEUE - 1,
+ ATA_TAG_INTERNAL = 0,
ATA_SHORT_PAUSE = 16,
ATAPI_MAX_DRAIN = 16 << 10,
@@ -816,7 +817,8 @@ struct ata_port {
unsigned int cbl; /* cable type; ATA_CBL_xxx */
struct ata_queued_cmd qcmd[ATA_MAX_QUEUE];
- unsigned long qc_allocated;
+ struct percpu_ida qc_internal;
+ struct percpu_ida qc_tags;
unsigned int qc_active;
int nr_active_links; /* #links with active qcs */
--
1.7.7.6
On Tue, Jan 21, 2014 at 04:02:26PM +0100, Alexander Gordeev wrote:
> Scanning ata_port::qc_allocated bitmask for free tags has
> been a performance bottleneck due to the cache line bouncing.
> This update replaces ata_port::qc_allocated bitmask with
> optimized for parallel access percpu_ida tags allocator.
Hi Tejun,
There is a discussion about imperfect percpu_ida handling of low nr of tags.
I think this change is too early to bring.
Thanks!
--
Regards,
Alexander Gordeev
[email protected]