2012-11-08 16:30:10

by Ed L. Cashin

[permalink] [raw]
Subject: [PATCH 1/8] aoe: avoid running request handler on plugged queue

Signed-off-by: Ed Cashin <[email protected]>
---
drivers/block/aoe/aoecmd.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index c491fba..3ce01f6 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -925,7 +925,7 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)

/* cf. http://lkml.org/lkml/2006/10/31/28 */
if (!fastfail)
- q->request_fn(q);
+ __blk_run_queue(q);
}

static void
--
1.7.1


2012-11-08 16:33:13

by Ed L. Cashin

[permalink] [raw]
Subject: [PATCH 2/8] aoe: provide ATA identify device content to user on request

This patch makes the aoe driver follow expected behavior when
the user uses ioctl to get the ATA device identify information.

Signed-off-by: Ed Cashin <[email protected]>
---
drivers/block/aoe/aoe.h | 1 +
drivers/block/aoe/aoeblk.c | 30 ++++++++++++++++++++++++++++++
drivers/block/aoe/aoecmd.c | 16 ++++++++++++++++
3 files changed, 47 insertions(+), 0 deletions(-)

diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 536942b..f6e0c03 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -169,6 +169,7 @@ struct aoedev {
struct aoetgt *htgt; /* target needing rexmit assistance */
ulong ntargets;
ulong kicked;
+ char ident[512];
};

/* kthread tracking */
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 56736cd..7ba0fcf 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -17,6 +17,7 @@
#include <linux/mutex.h>
#include <linux/export.h>
#include <linux/moduleparam.h>
+#include <scsi/sg.h>
#include "aoe.h"

static DEFINE_MUTEX(aoeblk_mutex);
@@ -212,9 +213,38 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}

+static int
+aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg)
+{
+ struct aoedev *d;
+
+ if (!arg)
+ return -EINVAL;
+
+ d = bdev->bd_disk->private_data;
+ if ((d->flags & DEVFL_UP) == 0) {
+ pr_err("aoe: disk not up\n");
+ return -ENODEV;
+ }
+
+ if (cmd == HDIO_GET_IDENTITY) {
+ if (!copy_to_user((void __user *) arg, &d->ident,
+ sizeof(d->ident)))
+ return 0;
+ return -EFAULT;
+ }
+
+ /* udev calls scsi_id, which uses SG_IO, resulting in noise */
+ if (cmd != SG_IO)
+ pr_info("aoe: unknown ioctl 0x%x\n", cmd);
+
+ return -ENOTTY;
+}
+
static const struct block_device_operations aoe_bdops = {
.open = aoeblk_open,
.release = aoeblk_release,
+ .ioctl = aoeblk_ioctl,
.getgeo = aoeblk_getgeo,
.owner = THIS_MODULE,
};
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 3ce01f6..c4ff70b 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -799,6 +799,17 @@ aoecmd_sleepwork(struct work_struct *work)
}

static void
+ata_ident_fixstring(u16 *id, int ns)
+{
+ u16 s;
+
+ while (ns-- > 0) {
+ s = *id;
+ *id++ = s >> 8 | s << 8;
+ }
+}
+
+static void
ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
{
u64 ssize;
@@ -833,6 +844,11 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
}

+ ata_ident_fixstring((u16 *) &id[10<<1], 10); /* serial */
+ ata_ident_fixstring((u16 *) &id[23<<1], 4); /* firmware */
+ ata_ident_fixstring((u16 *) &id[27<<1], 20); /* model */
+ memcpy(d->ident, id, sizeof(d->ident));
+
if (d->ssize != ssize)
printk(KERN_INFO
"aoe: %pm e%ld.%d v%04x has %llu sectors\n",
--
1.7.1

2012-11-08 16:35:05

by Ed L. Cashin

[permalink] [raw]
Subject: [PATCH 3/8] aoe: improve network congestion handling

The aoe driver already had some congestion handling, but it was
limited in its ability to cope with the kind of congestion that
can arise on more complex networks such as those involving paths
through multiple ethernet switches.

Some of the lessons from TCP's history of development can be
applied to improving the congestion control and avoidance on AoE
storage networks. These changes use familar concepts from Van
Jacobson's "Congestion Avoidance and Control" paper from '88,
without adding significant overhead.

This patch depends on an upcoming patch that covers the failover
case when AoE commands being retransmitted are transferred from
one retransmit queue to another.

Signed-off-by: Ed Cashin <[email protected]>
---
drivers/block/aoe/aoe.h | 16 +++--
drivers/block/aoe/aoecmd.c | 173 +++++++++++++++++++++++++++-----------------
drivers/block/aoe/aoedev.c | 6 +-
3 files changed, 121 insertions(+), 74 deletions(-)

diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index f6e0c03..9e884ac 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -86,8 +86,11 @@ enum {
NFACTIVE = 61,

TIMERTICK = HZ / 10,
- MINTIMER = HZ >> 2,
+ RTTSCALE = 8,
+ RTTDSCALE = 3,
MAXTIMER = HZ << 1,
+ RTTAVG_INIT = HZ / 4 << RTTSCALE,
+ RTTDEV_INIT = RTTAVG_INIT / 4,
};

struct buf {
@@ -127,10 +130,11 @@ struct aoetgt {
struct list_head ffree; /* list of free frames */
struct aoeif ifs[NAOEIFS];
struct aoeif *ifp; /* current aoeif in use */
- ushort nout;
+ ushort nout; /* value of nout when skb was sent */
ushort maxout; /* current value for max outstanding */
+ ushort next_cwnd; /* incr maxout after decrementing to zero */
+ ushort ssthresh; /* slow start threshold */
ulong falloc; /* number of allocated frames */
- ulong lastwadj; /* last window adjustment */
int minbcnt;
int wpkts, rpkts;
};
@@ -142,8 +146,8 @@ struct aoedev {
u16 aoeminor;
u16 flags;
u16 nopen; /* (bd_openers isn't available without sleeping) */
- u16 rttavg; /* round trip average of requests/responses */
- u16 mintimer;
+ u16 rttavg; /* scaled AoE round trip time average */
+ u16 rttdev; /* scaled round trip time mean deviation */
u16 fw_ver; /* version of blade's firmware */
u16 lasttag; /* last tag sent */
u16 useme;
@@ -164,6 +168,7 @@ struct aoedev {
} ip;
ulong maxbcnt;
struct list_head factive[NFACTIVE]; /* hash of active frames */
+ struct list_head rexmitq; /* deferred retransmissions */
struct aoetgt *targets[NTARGETS];
struct aoetgt **tgt; /* target in use when working */
struct aoetgt *htgt; /* target needing rexmit assistance */
@@ -196,6 +201,7 @@ void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor);
struct sk_buff *aoecmd_ata_rsp(struct sk_buff *);
void aoecmd_cfg_rsp(struct sk_buff *);
void aoecmd_sleepwork(struct work_struct *);
+void aoecmd_wreset(struct aoetgt *t);
void aoecmd_cleanslate(struct aoedev *);
void aoecmd_exit(void);
int aoecmd_init(void);
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index c4ff70b..f849fa2 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -59,6 +59,23 @@ new_skb(ulong len)
}

static struct frame *
+getframe_deferred(struct aoedev *d, u32 tag)
+{
+ struct list_head *head, *pos, *nx;
+ struct frame *f;
+
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (f->tag == tag) {
+ list_del(pos);
+ return f;
+ }
+ }
+ return NULL;
+}
+
+static struct frame *
getframe(struct aoedev *d, u32 tag)
{
struct frame *f;
@@ -553,10 +570,29 @@ sthtith(struct aoedev *d)
}

static void
+rexmit_deferred(struct aoedev *d)
+{
+ struct aoetgt *t;
+ struct frame *f;
+ struct list_head *pos, *nx, *head;
+
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ t = f->t;
+ if (t->nout >= t->maxout)
+ continue;
+ list_del(pos);
+ t->nout++;
+ resend(d, f);
+ }
+}
+
+static void
rexmit_timer(ulong vp)
{
struct aoedev *d;
- struct aoetgt *t, **tt, **te;
+ struct aoetgt *t;
struct aoeif *ifp;
struct frame *f;
struct list_head *head, *pos, *nx;
@@ -567,9 +603,11 @@ rexmit_timer(ulong vp)

d = (struct aoedev *) vp;

- /* timeout is always ~150% of the moving average */
- timeout = d->rttavg;
- timeout += timeout >> 1;
+ /* timeout based on observed timings and variations */
+ timeout = 2 * d->rttavg >> RTTSCALE;
+ timeout += 8 * d->rttdev >> RTTDSCALE;
+ if (timeout == 0)
+ timeout = 1;

spin_lock_irqsave(&d->lock, flags);

@@ -589,29 +627,12 @@ rexmit_timer(ulong vp)
list_move_tail(pos, &flist);
}
}
- /* window check */
- tt = d->targets;
- te = tt + d->ntargets;
- for (; tt < te && (t = *tt); tt++) {
- if (t->nout == t->maxout
- && t->maxout < t->nframes
- && (jiffies - t->lastwadj)/HZ > 10) {
- t->maxout++;
- t->lastwadj = jiffies;
- }
- }
-
- if (!list_empty(&flist)) { /* retransmissions necessary */
- n = d->rttavg <<= 1;
- if (n > MAXTIMER)
- d->rttavg = MAXTIMER;
- }

/* process expired frames */
while (!list_empty(&flist)) {
pos = flist.next;
f = list_entry(pos, struct frame, head);
- n = f->waited += timeout;
+ n = f->waited += tsince(f->tag);
n /= HZ;
if (n > aoe_deadsecs) {
/* Waited too long. Device failure.
@@ -620,18 +641,16 @@ rexmit_timer(ulong vp)
*/
list_splice(&flist, &d->factive[0]);
aoedev_downdev(d);
- break;
+ goto out;
}
- list_del(pos);

t = f->t;
if (n > aoe_deadsecs/2)
d->htgt = t; /* see if another target can help */

- if (t->nout == t->maxout) {
- if (t->maxout > 1)
- t->maxout--;
- t->lastwadj = jiffies;
+ if (t->maxout != 1) {
+ t->ssthresh = t->maxout / 2;
+ t->maxout = 1;
}

ifp = getif(t, f->skb->dev);
@@ -640,9 +659,12 @@ rexmit_timer(ulong vp)
ejectif(t, ifp);
ifp = NULL;
}
- resend(d, f);
+ list_move_tail(pos, &d->rexmitq);
+ t->nout--;
}
+ rexmit_deferred(d);

+out:
if ((d->flags & DEVFL_KICKME || d->htgt) && d->blkq) {
d->flags &= ~DEVFL_KICKME;
d->blkq->request_fn(d->blkq);
@@ -766,6 +788,7 @@ aoecmd_work(struct aoedev *d)
{
if (d->htgt && !sthtith(d))
return;
+ rexmit_deferred(d);
while (aoecmd_ata_rw(d))
;
}
@@ -868,26 +891,28 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
}

static void
-calc_rttavg(struct aoedev *d, int rtt)
+calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt)
{
register long n;

n = rtt;
- if (n < 0) {
- n = -rtt;
- if (n < MINTIMER)
- n = MINTIMER;
- else if (n > MAXTIMER)
- n = MAXTIMER;
- d->mintimer += (n - d->mintimer) >> 1;
- } else if (n < d->mintimer)
- n = d->mintimer;
- else if (n > MAXTIMER)
- n = MAXTIMER;
-
- /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */
- n -= d->rttavg;
- d->rttavg += n >> 2;
+
+ /* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988 */
+ n -= d->rttavg >> RTTSCALE;
+ d->rttavg += n;
+ if (n < 0)
+ n = -n;
+ n -= d->rttdev >> RTTDSCALE;
+ d->rttdev += n;
+
+ if (!t || t->maxout >= t->nframes)
+ return;
+ if (t->maxout < t->ssthresh)
+ t->maxout += 1;
+ else if (t->nout == t->maxout && t->next_cwnd-- == 0) {
+ t->maxout += 1;
+ t->next_cwnd = t->maxout;
+ }
}

static struct aoetgt *
@@ -1147,7 +1172,6 @@ aoecmd_ata_rsp(struct sk_buff *skb)
struct aoedev *d;
struct aoe_hdr *h;
struct frame *f;
- struct aoetgt *t;
u32 n;
ulong flags;
char ebuf[128];
@@ -1168,23 +1192,28 @@ aoecmd_ata_rsp(struct sk_buff *skb)

n = be32_to_cpu(get_unaligned(&h->tag));
f = getframe(d, n);
- if (f == NULL) {
- calc_rttavg(d, -tsince(n));
- spin_unlock_irqrestore(&d->lock, flags);
- aoedev_put(d);
- snprintf(ebuf, sizeof ebuf,
- "%15s e%d.%d tag=%08x@%08lx\n",
- "unexpected rsp",
- get_unaligned_be16(&h->major),
- h->minor,
- get_unaligned_be32(&h->tag),
- jiffies);
- aoechr_error(ebuf);
- return skb;
+ if (f) {
+ calc_rttavg(d, f->t, tsince(n));
+ f->t->nout--;
+ } else {
+ f = getframe_deferred(d, n);
+ if (f) {
+ calc_rttavg(d, NULL, tsince(n));
+ } else {
+ calc_rttavg(d, NULL, tsince(n));
+ spin_unlock_irqrestore(&d->lock, flags);
+ aoedev_put(d);
+ snprintf(ebuf, sizeof(ebuf),
+ "%15s e%d.%d tag=%08x@%08lx\n",
+ "unexpected rsp",
+ get_unaligned_be16(&h->major),
+ h->minor,
+ get_unaligned_be32(&h->tag),
+ jiffies);
+ aoechr_error(ebuf);
+ return skb;
+ }
}
- t = f->t;
- calc_rttavg(d, tsince(f->tag));
- t->nout--;
aoecmd_work(d);

spin_unlock_irqrestore(&d->lock, flags);
@@ -1241,7 +1270,8 @@ aoecmd_ata_id(struct aoedev *d)

skb->dev = t->ifp->nd;

- d->rttavg = MAXTIMER;
+ d->rttavg = RTTAVG_INIT;
+ d->rttdev = RTTDEV_INIT;
d->timer.function = rexmit_timer;

return skb_clone(skb, GFP_ATOMIC);
@@ -1273,7 +1303,7 @@ addtgt(struct aoedev *d, char *addr, ulong nframes)
t->d = d;
memcpy(t->addr, addr, sizeof t->addr);
t->ifp = t->ifs;
- t->maxout = t->nframes;
+ aoecmd_wreset(t);
INIT_LIST_HEAD(&t->ffree);
return *tt = t;
}
@@ -1382,7 +1412,7 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
if (t) {
t->nframes = n;
if (n < t->maxout)
- t->maxout = n;
+ aoecmd_wreset(t);
} else {
t = addtgt(d, h->src, n);
if (!t)
@@ -1412,17 +1442,26 @@ bail:
}

void
+aoecmd_wreset(struct aoetgt *t)
+{
+ t->maxout = 1;
+ t->ssthresh = t->nframes / 2;
+ t->next_cwnd = t->nframes;
+}
+
+void
aoecmd_cleanslate(struct aoedev *d)
{
struct aoetgt **t, **te;

- d->mintimer = MINTIMER;
+ d->rttavg = RTTAVG_INIT;
+ d->rttdev = RTTDEV_INIT;
d->maxbcnt = 0;

t = d->targets;
te = t + NTARGETS;
for (; t < te && *t; t++)
- (*t)->maxout = (*t)->nframes;
+ aoecmd_wreset(*t);
}

void
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 63b2660..3c3aef2 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -198,7 +198,7 @@ aoedev_downdev(struct aoedev *d)
tt = d->targets;
te = tt + NTARGETS;
for (; tt < te && (t = *tt); tt++) {
- t->maxout = t->nframes;
+ aoecmd_wreset(t);
t->nout = 0;
}

@@ -391,10 +391,12 @@ aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
d->ref = 1;
for (i = 0; i < NFACTIVE; i++)
INIT_LIST_HEAD(&d->factive[i]);
+ INIT_LIST_HEAD(&d->rexmitq);
d->sysminor = sysminor;
d->aoemajor = maj;
d->aoeminor = min;
- d->mintimer = MINTIMER;
+ d->rttavg = RTTAVG_INIT;
+ d->rttdev = RTTDEV_INIT;
d->next = devlist;
devlist = d;
out:
--
1.7.1

2012-11-08 16:43:06

by Ed L. Cashin

[permalink] [raw]
Subject: [PATCH 4/8] aoe: err device: include MAC addresses for unexpected responses

Signed-off-by: Ed Cashin <[email protected]>
---
drivers/block/aoe/aoecmd.c | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index f849fa2..6ea27fd 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1204,12 +1204,14 @@ aoecmd_ata_rsp(struct sk_buff *skb)
spin_unlock_irqrestore(&d->lock, flags);
aoedev_put(d);
snprintf(ebuf, sizeof(ebuf),
- "%15s e%d.%d tag=%08x@%08lx\n",
+ "%15s e%d.%d tag=%08x@%08lx s=%pm d=%pm\n",
"unexpected rsp",
get_unaligned_be16(&h->major),
h->minor,
get_unaligned_be32(&h->tag),
- jiffies);
+ jiffies,
+ h->src,
+ h->dst);
aoechr_error(ebuf);
return skb;
}
--
1.7.1

2012-11-08 16:45:04

by Ed L. Cashin

[permalink] [raw]
Subject: [PATCH 5/8] aoe: manipulate aoedev network stats under lock

Signed-off-by: Ed Cashin <[email protected]>
---
drivers/block/aoe/aoecmd.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 6ea27fd..9aefbe3 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -603,14 +603,14 @@ rexmit_timer(ulong vp)

d = (struct aoedev *) vp;

+ spin_lock_irqsave(&d->lock, flags);
+
/* timeout based on observed timings and variations */
timeout = 2 * d->rttavg >> RTTSCALE;
timeout += 8 * d->rttdev >> RTTDSCALE;
if (timeout == 0)
timeout = 1;

- spin_lock_irqsave(&d->lock, flags);
-
if (d->flags & DEVFL_TKILL) {
spin_unlock_irqrestore(&d->lock, flags);
return;
--
1.7.1

2012-11-08 16:47:04

by Ed L. Cashin

[permalink] [raw]
Subject: [PATCH 6/8] aoe: use high-resolution RTTs with fallback to low-res

These changes improve the accuracy of the decision about whether
it's time to retransmit an AoE command by using the
microsecond-resolution gettimeofday instead of jiffies.

Because the system time can jump suddenly, the decision reverts
to using jiffies if the high-resolution time difference is
relatively large.

Signed-off-by: Ed Cashin <[email protected]>
---
drivers/block/aoe/aoe.h | 9 ++++---
drivers/block/aoe/aoecmd.c | 57 ++++++++++++++++++++++++++++++++++++++-----
2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 9e884ac..9fb68fc 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -88,8 +88,7 @@ enum {
TIMERTICK = HZ / 10,
RTTSCALE = 8,
RTTDSCALE = 3,
- MAXTIMER = HZ << 1,
- RTTAVG_INIT = HZ / 4 << RTTSCALE,
+ RTTAVG_INIT = USEC_PER_SEC / 4 << RTTSCALE,
RTTDEV_INIT = RTTAVG_INIT / 4,
};

@@ -106,6 +105,8 @@ struct buf {
struct frame {
struct list_head head;
u32 tag;
+ struct timeval sent; /* high-res time packet was sent */
+ u32 sent_jiffs; /* low-res jiffies-based sent time */
ulong waited;
struct aoetgt *t; /* parent target I belong to */
sector_t lba;
@@ -143,11 +144,11 @@ struct aoedev {
struct aoedev *next;
ulong sysminor;
ulong aoemajor;
+ u32 rttavg; /* scaled AoE round trip time average */
+ u32 rttdev; /* scaled round trip time mean deviation */
u16 aoeminor;
u16 flags;
u16 nopen; /* (bd_openers isn't available without sleeping) */
- u16 rttavg; /* scaled AoE round trip time average */
- u16 rttdev; /* scaled round trip time mean deviation */
u16 fw_ver; /* version of blade's firmware */
u16 lasttag; /* last tag sent */
u16 useme;
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 9aefbe3..a99220a 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -387,6 +387,8 @@ aoecmd_ata_rw(struct aoedev *d)
skb->dev = t->ifp->nd;
skb = skb_clone(skb, GFP_ATOMIC);
if (skb) {
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
@@ -475,12 +477,46 @@ resend(struct aoedev *d, struct frame *f)
skb = skb_clone(skb, GFP_ATOMIC);
if (skb == NULL)
return;
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
}

static int
+tsince_hr(struct frame *f)
+{
+ struct timeval now;
+ int n;
+
+ do_gettimeofday(&now);
+ n = now.tv_usec - f->sent.tv_usec;
+ n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
+
+ if (n < 0)
+ n = -n;
+
+ /* For relatively long periods, use jiffies to avoid
+ * discrepancies caused by updates to the system time.
+ *
+ * On system with HZ of 1000, 32-bits is over 49 days
+ * worth of jiffies, or over 71 minutes worth of usecs.
+ *
+ * Jiffies overflow is handled by subtraction of unsigned ints:
+ * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
+ * $3 = 4
+ * (gdb)
+ */
+ if (n > USEC_PER_SEC / 4) {
+ n = ((u32) jiffies) - f->sent_jiffs;
+ n *= USEC_PER_SEC / HZ;
+ }
+
+ return n;
+}
+
+static int
tsince(u32 tag)
{
int n;
@@ -489,7 +525,7 @@ tsince(u32 tag)
n -= tag & 0xffff;
if (n < 0)
n += 1<<16;
- return n;
+ return jiffies_to_usecs(n + 1);
}

static struct aoeif *
@@ -552,6 +588,7 @@ sthtith(struct aoedev *d)
nf->bv = f->bv;
nf->bv_off = f->bv_off;
nf->waited = 0;
+ nf->sent_jiffs = f->sent_jiffs;
f->skb = skb;
aoe_freetframe(f);
ht->nout--;
@@ -621,7 +658,7 @@ rexmit_timer(ulong vp)
head = &d->factive[i];
list_for_each_safe(pos, nx, head) {
f = list_entry(pos, struct frame, head);
- if (tsince(f->tag) < timeout)
+ if (tsince_hr(f) < timeout)
break; /* end of expired frames */
/* move to flist for later processing */
list_move_tail(pos, &flist);
@@ -632,8 +669,8 @@ rexmit_timer(ulong vp)
while (!list_empty(&flist)) {
pos = flist.next;
f = list_entry(pos, struct frame, head);
- n = f->waited += tsince(f->tag);
- n /= HZ;
+ n = f->waited += tsince_hr(f);
+ n /= USEC_PER_SEC;
if (n > aoe_deadsecs) {
/* Waited too long. Device failure.
* Hang all frames on first hash bucket for downdev
@@ -1193,12 +1230,12 @@ aoecmd_ata_rsp(struct sk_buff *skb)
n = be32_to_cpu(get_unaligned(&h->tag));
f = getframe(d, n);
if (f) {
- calc_rttavg(d, f->t, tsince(n));
+ calc_rttavg(d, f->t, tsince_hr(f));
f->t->nout--;
} else {
f = getframe_deferred(d, n);
if (f) {
- calc_rttavg(d, NULL, tsince(n));
+ calc_rttavg(d, NULL, tsince_hr(f));
} else {
calc_rttavg(d, NULL, tsince(n));
spin_unlock_irqrestore(&d->lock, flags);
@@ -1276,7 +1313,13 @@ aoecmd_ata_id(struct aoedev *d)
d->rttdev = RTTDEV_INIT;
d->timer.function = rexmit_timer;

- return skb_clone(skb, GFP_ATOMIC);
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (skb) {
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ }
+
+ return skb;
}

static struct aoetgt *
--
1.7.1

2012-11-08 16:49:04

by Ed L. Cashin

[permalink] [raw]
Subject: [PATCH 7/8] aoe: commands in retransmit queue use new destination on failure

When one remote MAC address isn't working as a destination for
AoE commands, the frames used to track information associated
with the AoE commands are moved to a new aoetgt (defined by the
tuple of {AoE major, AoE minor, target MAC address}).

This patch makes sure that the frames on the queue for
retransmits that need to be done are updated to use the new
destination, so that retransmits will be sent through a working
network path.

Signed-off-by: Ed Cashin <[email protected]>
---
drivers/block/aoe/aoe.h | 1 +
drivers/block/aoe/aoecmd.c | 75 +++++++++++++++++++++++++++++++-------------
drivers/block/aoe/aoedev.c | 32 ++++++++++++------
3 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 9fb68fc..c253cca 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -108,6 +108,7 @@ struct frame {
struct timeval sent; /* high-res time packet was sent */
u32 sent_jiffs; /* low-res jiffies-based sent time */
ulong waited;
+ ulong waited_total;
struct aoetgt *t; /* parent target I belong to */
sector_t lba;
struct sk_buff *skb; /* command skb freed on module exit */
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index a99220a..d9bc6ff 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -352,6 +352,7 @@ aoecmd_ata_rw(struct aoedev *d)
fhash(f);
t->nout++;
f->waited = 0;
+ f->waited_total = 0;
f->buf = buf;
f->bcnt = bcnt;
f->lba = buf->sector;
@@ -556,46 +557,69 @@ ejectif(struct aoetgt *t, struct aoeif *ifp)
dev_put(nd);
}

+static struct frame *
+reassign_frame(struct list_head *pos)
+{
+ struct frame *f;
+ struct frame *nf;
+ struct sk_buff *skb;
+
+ f = list_entry(pos, struct frame, head);
+ nf = newframe(f->t->d);
+ if (!nf)
+ return NULL;
+
+ list_del(pos);
+
+ skb = nf->skb;
+ nf->skb = f->skb;
+ nf->buf = f->buf;
+ nf->bcnt = f->bcnt;
+ nf->lba = f->lba;
+ nf->bv = f->bv;
+ nf->bv_off = f->bv_off;
+ nf->waited = 0;
+ nf->waited_total = f->waited_total;
+ nf->sent = f->sent;
+ f->skb = skb;
+ aoe_freetframe(f);
+ f->t->nout--;
+ nf->t->nout++;
+
+ return nf;
+}
+
static int
sthtith(struct aoedev *d)
{
struct frame *f, *nf;
struct list_head *nx, *pos, *head;
- struct sk_buff *skb;
struct aoetgt *ht = d->htgt;
int i;

+ /* look through the active and pending retransmit frames */
for (i = 0; i < NFACTIVE; i++) {
head = &d->factive[i];
list_for_each_safe(pos, nx, head) {
f = list_entry(pos, struct frame, head);
if (f->t != ht)
continue;
-
- nf = newframe(d);
+ nf = reassign_frame(pos);
if (!nf)
return 0;
-
- /* remove frame from active list */
- list_del(pos);
-
- /* reassign all pertinent bits to new outbound frame */
- skb = nf->skb;
- nf->skb = f->skb;
- nf->buf = f->buf;
- nf->bcnt = f->bcnt;
- nf->lba = f->lba;
- nf->bv = f->bv;
- nf->bv_off = f->bv_off;
- nf->waited = 0;
- nf->sent_jiffs = f->sent_jiffs;
- f->skb = skb;
- aoe_freetframe(f);
- ht->nout--;
- nf->t->nout++;
resend(d, nf);
}
}
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (f->t != ht)
+ continue;
+ nf = reassign_frame(pos);
+ if (!nf)
+ return 0;
+ resend(d, nf);
+ }
/* We've cleaned up the outstanding so take away his
* interfaces so he won't be used. We should remove him from
* the target array here, but cleaning up a target is
@@ -612,6 +636,7 @@ rexmit_deferred(struct aoedev *d)
struct aoetgt *t;
struct frame *f;
struct list_head *pos, *nx, *head;
+ int since;

head = &d->rexmitq;
list_for_each_safe(pos, nx, head) {
@@ -621,6 +646,9 @@ rexmit_deferred(struct aoedev *d)
continue;
list_del(pos);
t->nout++;
+ since = tsince_hr(f);
+ f->waited += since;
+ f->waited_total += since;
resend(d, f);
}
}
@@ -637,6 +665,7 @@ rexmit_timer(ulong vp)
register long timeout;
ulong flags, n;
int i;
+ int since;

d = (struct aoedev *) vp;

@@ -669,7 +698,8 @@ rexmit_timer(ulong vp)
while (!list_empty(&flist)) {
pos = flist.next;
f = list_entry(pos, struct frame, head);
- n = f->waited += tsince_hr(f);
+ since = tsince_hr(f);
+ n = f->waited_total + since;
n /= USEC_PER_SEC;
if (n > aoe_deadsecs) {
/* Waited too long. Device failure.
@@ -1301,6 +1331,7 @@ aoecmd_ata_id(struct aoedev *d)
fhash(f);
t->nout++;
f->waited = 0;
+ f->waited_total = 0;

/* set up ata header */
ah->scnt = 1;
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 3c3aef2..91f7c99 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -170,30 +170,40 @@ aoe_failip(struct aoedev *d)
aoe_end_request(d, rq, 0);
}

+static void
+downdev_frame(struct list_head *pos)
+{
+ struct frame *f;
+
+ f = list_entry(pos, struct frame, head);
+ list_del(pos);
+ if (f->buf) {
+ f->buf->nframesout--;
+ aoe_failbuf(f->t->d, f->buf);
+ }
+ aoe_freetframe(f);
+}
+
void
aoedev_downdev(struct aoedev *d)
{
struct aoetgt *t, **tt, **te;
- struct frame *f;
struct list_head *head, *pos, *nx;
struct request *rq;
int i;

d->flags &= ~DEVFL_UP;

- /* clean out active buffers */
+ /* clean out active and to-be-retransmitted buffers */
for (i = 0; i < NFACTIVE; i++) {
head = &d->factive[i];
- list_for_each_safe(pos, nx, head) {
- f = list_entry(pos, struct frame, head);
- list_del(pos);
- if (f->buf) {
- f->buf->nframesout--;
- aoe_failbuf(d, f->buf);
- }
- aoe_freetframe(f);
- }
+ list_for_each_safe(pos, nx, head)
+ downdev_frame(pos);
}
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head)
+ downdev_frame(pos);
+
/* reset window dressings */
tt = d->targets;
te = tt + NTARGETS;
--
1.7.1

2012-11-08 16:51:04

by Ed L. Cashin

[permalink] [raw]
Subject: [PATCH 8/8] aoe: update driver-internal version to 64+

Signed-off-by: Ed Cashin <[email protected]>
---
drivers/block/aoe/aoe.h | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index c253cca..9655ce3 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -1,5 +1,6 @@
+
/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
-#define VERSION "60"
+#define VERSION "64+"
#define AOE_MAJOR 152
#define DEVICE_NAME "aoe"

--
1.7.1

2012-11-08 19:26:28

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/8] aoe: avoid running request handler on plugged queue

On Thu, 8 Nov 2012 11:29:32 -0500
Ed Cashin <[email protected]> wrote:

> Signed-off-by: Ed Cashin <[email protected]>

Could you please prepare decent changelogs for the patches? Several of
these appear to be bugfixes but we have no description of the
user-visible behavioural changes. So nobody knows what the patches do,
nor which kernel version(s) they should be merged into.

Thanks.

2012-11-08 19:32:51

by Ed L. Cashin

[permalink] [raw]
Subject: Re: [PATCH 1/8] aoe: avoid running request handler on plugged queue

On Nov 8, 2012, at 2:26 PM, Andrew Morton wrote:

> On Thu, 8 Nov 2012 11:29:32 -0500
> Ed Cashin <[email protected]> wrote:
>
>> Signed-off-by: Ed Cashin <[email protected]>
>
> Could you please prepare decent changelogs for the patches? Several of
> these appear to be bugfixes but we have no description of the
> user-visible behavioural changes. So nobody knows what the patches do,
> nor which kernel version(s) they should be merged into.

Yes, sure. For this one, I Cc'ed Jens Axboe in part because although I believe that it is correct to use __blk_run_queue rather than to call the request handler directly, I am going on old information. I'm not entirely sure what the user impact is when the request hander is called directly, other than perhaps a slight performance hit.

But I'd like to do the right thing. Jens Axboe, if you have a comment about correctness or user impact, I'd like to include that information in the changelog for this commit. Or if the change doesn't look right to you, I'd like to know that.

Thanks.

--
Ed Cashin
[email protected]

2012-11-09 00:40:54

by Jeff Garzik

[permalink] [raw]
Subject: Re: [PATCH 2/8] aoe: provide ATA identify device content to user on request

On 11/08/2012 11:32 AM, Ed Cashin wrote:
> This patch makes the aoe driver follow expected behavior when
> the user uses ioctl to get the ATA device identify information.
>
> Signed-off-by: Ed Cashin <[email protected]>
> ---
> drivers/block/aoe/aoe.h | 1 +
> drivers/block/aoe/aoeblk.c | 30 ++++++++++++++++++++++++++++++
> drivers/block/aoe/aoecmd.c | 16 ++++++++++++++++
> 3 files changed, 47 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
> index 536942b..f6e0c03 100644
> --- a/drivers/block/aoe/aoe.h
> +++ b/drivers/block/aoe/aoe.h
> @@ -169,6 +169,7 @@ struct aoedev {
> struct aoetgt *htgt; /* target needing rexmit assistance */
> ulong ntargets;
> ulong kicked;
> + char ident[512];
> };
>
> /* kthread tracking */
> diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
> index 56736cd..7ba0fcf 100644
> --- a/drivers/block/aoe/aoeblk.c
> +++ b/drivers/block/aoe/aoeblk.c
> @@ -17,6 +17,7 @@
> #include <linux/mutex.h>
> #include <linux/export.h>
> #include <linux/moduleparam.h>
> +#include <scsi/sg.h>
> #include "aoe.h"
>
> static DEFINE_MUTEX(aoeblk_mutex);
> @@ -212,9 +213,38 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
> return 0;
> }
>
> +static int
> +aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg)
> +{
> + struct aoedev *d;
> +
> + if (!arg)
> + return -EINVAL;
> +
> + d = bdev->bd_disk->private_data;
> + if ((d->flags & DEVFL_UP) == 0) {
> + pr_err("aoe: disk not up\n");
> + return -ENODEV;
> + }
> +
> + if (cmd == HDIO_GET_IDENTITY) {
> + if (!copy_to_user((void __user *) arg, &d->ident,
> + sizeof(d->ident)))
> + return 0;
> + return -EFAULT;
> + }
> +
> + /* udev calls scsi_id, which uses SG_IO, resulting in noise */
> + if (cmd != SG_IO)
> + pr_info("aoe: unknown ioctl 0x%x\n", cmd);
> +
> + return -ENOTTY;
> +}
> +
> static const struct block_device_operations aoe_bdops = {
> .open = aoeblk_open,
> .release = aoeblk_release,
> + .ioctl = aoeblk_ioctl,
> .getgeo = aoeblk_getgeo,
> .owner = THIS_MODULE,
> };
> diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
> index 3ce01f6..c4ff70b 100644
> --- a/drivers/block/aoe/aoecmd.c
> +++ b/drivers/block/aoe/aoecmd.c
> @@ -799,6 +799,17 @@ aoecmd_sleepwork(struct work_struct *work)
> }
>
> static void
> +ata_ident_fixstring(u16 *id, int ns)
> +{
> + u16 s;
> +
> + while (ns-- > 0) {
> + s = *id;
> + *id++ = s >> 8 | s << 8;
> + }
> +}
> +
> +static void
> ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
> {
> u64 ssize;
> @@ -833,6 +844,11 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
> d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
> }
>
> + ata_ident_fixstring((u16 *) &id[10<<1], 10); /* serial */
> + ata_ident_fixstring((u16 *) &id[23<<1], 4); /* firmware */
> + ata_ident_fixstring((u16 *) &id[27<<1], 20); /* model */

This duplicates ata_id_string() and/or ata_id_c_string(), does it not?


2012-11-09 02:10:58

by Ed L. Cashin

[permalink] [raw]
Subject: Re: [PATCH 2/8] aoe: provide ATA identify device content to user on request

On Nov 8, 2012, at 7:40 PM, Jeff Garzik wrote:

> On 11/08/2012 11:32 AM, Ed Cashin wrote:
>> This patch makes the aoe driver follow expected behavior when
>> the user uses ioctl to get the ATA device identify information.
>>
>> Signed-off-by: Ed Cashin <[email protected]>
>> ---
>> drivers/block/aoe/aoe.h | 1 +
>> drivers/block/aoe/aoeblk.c | 30 ++++++++++++++++++++++++++++++
>> drivers/block/aoe/aoecmd.c | 16 ++++++++++++++++
>> 3 files changed, 47 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
>> index 536942b..f6e0c03 100644
>> --- a/drivers/block/aoe/aoe.h
>> +++ b/drivers/block/aoe/aoe.h
>> @@ -169,6 +169,7 @@ struct aoedev {
>> struct aoetgt *htgt; /* target needing rexmit assistance */
>> ulong ntargets;
>> ulong kicked;
>> + char ident[512];
>> };
>>
>> /* kthread tracking */
>> diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
>> index 56736cd..7ba0fcf 100644
>> --- a/drivers/block/aoe/aoeblk.c
>> +++ b/drivers/block/aoe/aoeblk.c
>> @@ -17,6 +17,7 @@
>> #include <linux/mutex.h>
>> #include <linux/export.h>
>> #include <linux/moduleparam.h>
>> +#include <scsi/sg.h>
>> #include "aoe.h"
>>
>> static DEFINE_MUTEX(aoeblk_mutex);
>> @@ -212,9 +213,38 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
>> return 0;
>> }
>>
>> +static int
>> +aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg)
>> +{
>> + struct aoedev *d;
>> +
>> + if (!arg)
>> + return -EINVAL;
>> +
>> + d = bdev->bd_disk->private_data;
>> + if ((d->flags & DEVFL_UP) == 0) {
>> + pr_err("aoe: disk not up\n");
>> + return -ENODEV;
>> + }
>> +
>> + if (cmd == HDIO_GET_IDENTITY) {
>> + if (!copy_to_user((void __user *) arg, &d->ident,
>> + sizeof(d->ident)))
>> + return 0;
>> + return -EFAULT;
>> + }
>> +
>> + /* udev calls scsi_id, which uses SG_IO, resulting in noise */
>> + if (cmd != SG_IO)
>> + pr_info("aoe: unknown ioctl 0x%x\n", cmd);
>> +
>> + return -ENOTTY;
>> +}
>> +
>> static const struct block_device_operations aoe_bdops = {
>> .open = aoeblk_open,
>> .release = aoeblk_release,
>> + .ioctl = aoeblk_ioctl,
>> .getgeo = aoeblk_getgeo,
>> .owner = THIS_MODULE,
>> };
>> diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
>> index 3ce01f6..c4ff70b 100644
>> --- a/drivers/block/aoe/aoecmd.c
>> +++ b/drivers/block/aoe/aoecmd.c
>> @@ -799,6 +799,17 @@ aoecmd_sleepwork(struct work_struct *work)
>> }
>>
>> static void
>> +ata_ident_fixstring(u16 *id, int ns)
>> +{
>> + u16 s;
>> +
>> + while (ns-- > 0) {
>> + s = *id;
>> + *id++ = s >> 8 | s << 8;
>> + }
>> +}
>> +
>> +static void
>> ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
>> {
>> u64 ssize;
>> @@ -833,6 +844,11 @@ ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
>> d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
>> }
>>
>> + ata_ident_fixstring((u16 *) &id[10<<1], 10); /* serial */
>> + ata_ident_fixstring((u16 *) &id[23<<1], 4); /* firmware */
>> + ata_ident_fixstring((u16 *) &id[27<<1], 20); /* model */
>
> This duplicates ata_id_string() and/or ata_id_c_string(), does it not?


They're similar, yes, but aoecmd.c:ata_ident_fixstring performs the byte swapping
in place, avoiding the need for any on-stack or other memory allocation. The code
is pretty readable now as a simple in-place byte swap, and I'm concerned that the
extra mechanics of buffer allocation, any allocation failure handling, and memmov-ing
the results back into the id array wouldn't simplify the code if aoecmd.c tried to use
ata_id_string as it is.

--
Ed Cashin
[email protected]