2006-08-18 18:54:47

by Ed L. Cashin

[permalink] [raw]
Subject: [PATCH 2.6.18-rc4] aoe [04/13]: zero copy write 1 of 2

Signed-off-by: "Ed L. Cashin" <[email protected]>

Avoid memory copy on writes.
(This patch depends on fixes in patch 9 to follow.)

diff -upr 2.6.18-rc4-orig/drivers/block/aoe/aoe.h 2.6.18-rc4-aoe/drivers/block/aoe/aoe.h
--- 2.6.18-rc4-orig/drivers/block/aoe/aoe.h 2006-08-17 16:45:34.000000000 -0400
+++ 2.6.18-rc4-aoe/drivers/block/aoe/aoe.h 2006-08-17 16:45:34.000000000 -0400
@@ -107,11 +107,7 @@ struct frame {
ulong waited;
struct buf *buf;
char *bufaddr;
- int writedatalen;
- int ndata;
-
- /* largest possible */
- unsigned char data[sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr)];
+ struct sk_buff *skb;
};

struct aoedev {
@@ -157,6 +153,7 @@ void aoecmd_cfg(ushort aoemajor, unsigne
void aoecmd_ata_rsp(struct sk_buff *);
void aoecmd_cfg_rsp(struct sk_buff *);
void aoecmd_sleepwork(void *vp);
+struct sk_buff *new_skb(ulong);

int aoedev_init(void);
void aoedev_exit(void);
diff -upr 2.6.18-rc4-orig/drivers/block/aoe/aoecmd.c 2.6.18-rc4-aoe/drivers/block/aoe/aoecmd.c
--- 2.6.18-rc4-orig/drivers/block/aoe/aoecmd.c 2006-08-17 16:45:34.000000000 -0400
+++ 2.6.18-rc4-aoe/drivers/block/aoe/aoecmd.c 2006-08-17 16:45:34.000000000 -0400
@@ -17,15 +17,14 @@
#define MAXTIMER (HZ << 1)
#define MAXWAIT (60 * 3) /* After MAXWAIT seconds, give up and fail dev */

-static struct sk_buff *
-new_skb(struct net_device *if_dev, ulong len)
+struct sk_buff *
+new_skb(ulong len)
{
struct sk_buff *skb;

skb = alloc_skb(len, GFP_ATOMIC);
if (skb) {
skb->nh.raw = skb->mac.raw = skb->data;
- skb->dev = if_dev;
skb->protocol = __constant_htons(ETH_P_AOE);
skb->priority = 0;
skb_put(skb, len);
@@ -40,29 +39,6 @@ new_skb(struct net_device *if_dev, ulong
return skb;
}

-static struct sk_buff *
-skb_prepare(struct aoedev *d, struct frame *f)
-{
- struct sk_buff *skb;
- char *p;
-
- skb = new_skb(d->ifp, f->ndata + f->writedatalen);
- if (!skb) {
- printk(KERN_INFO "aoe: skb_prepare: failure to allocate skb\n");
- return NULL;
- }
-
- p = skb->mac.raw;
- memcpy(p, f->data, f->ndata);
-
- if (f->writedatalen) {
- p += sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
- memcpy(p, f->bufaddr, f->writedatalen);
- }
-
- return skb;
-}
-
static struct frame *
getframe(struct aoedev *d, int tag)
{
@@ -129,10 +105,11 @@ aoecmd_ata_rw(struct aoedev *d, struct f
bcnt = MAXATADATA;

/* initialize the headers & frame */
- h = (struct aoe_hdr *) f->data;
+ skb = f->skb;
+ h = (struct aoe_hdr *) skb->mac.raw;
ah = (struct aoe_atahdr *) (h+1);
- f->ndata = sizeof *h + sizeof *ah;
- memset(h, 0, f->ndata);
+ skb->len = sizeof *h + sizeof *ah;
+ memset(h, 0, skb->len);
f->tag = aoehdr_atainit(d, h);
f->waited = 0;
f->buf = buf;
@@ -155,11 +132,13 @@ aoecmd_ata_rw(struct aoedev *d, struct f
}

if (bio_data_dir(buf->bio) == WRITE) {
+ skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
+ offset_in_page(f->bufaddr), bcnt);
ah->aflags |= AOEAFL_WRITE;
- f->writedatalen = bcnt;
} else {
+ skb_shinfo(skb)->nr_frags = 0;
+ skb->len = ETH_ZLEN;
writebit = 0;
- f->writedatalen = 0;
}

ah->cmdstat = WIN_READ | writebit | extbit;
@@ -179,15 +158,14 @@ aoecmd_ata_rw(struct aoedev *d, struct f
buf->bufaddr = page_address(buf->bv->bv_page) + buf->bv->bv_offset;
}

- skb = skb_prepare(d, f);
- if (skb) {
- skb->next = NULL;
- if (d->sendq_hd)
- d->sendq_tl->next = skb;
- else
- d->sendq_hd = skb;
- d->sendq_tl = skb;
- }
+ skb->dev = d->ifp;
+ skb_get(skb);
+ skb->next = NULL;
+ if (d->sendq_hd)
+ d->sendq_tl->next = skb;
+ else
+ d->sendq_hd = skb;
+ d->sendq_tl = skb;
}

/* some callers cannot sleep, and they can call this function,
@@ -209,11 +187,12 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigne
if (!is_aoe_netif(ifp))
continue;

- skb = new_skb(ifp, sizeof *h + sizeof *ch);
+ skb = new_skb(sizeof *h + sizeof *ch);
if (skb == NULL) {
printk(KERN_INFO "aoe: aoecmd_cfg: skb alloc failure\n");
continue;
}
+ skb->dev = ifp;
if (sl_tail == NULL)
sl_tail = skb;
h = (struct aoe_hdr *) skb->mac.raw;
@@ -283,21 +262,21 @@ rexmit(struct aoedev *d, struct frame *f
d->aoemajor, d->aoeminor, f->tag, jiffies, n);
aoechr_error(buf);

- h = (struct aoe_hdr *) f->data;
+ skb = f->skb;
+ h = (struct aoe_hdr *) skb->mac.raw;
f->tag = n;
h->tag = cpu_to_be32(n);
memcpy(h->dst, d->addr, sizeof h->dst);
memcpy(h->src, d->ifp->dev_addr, sizeof h->src);

- skb = skb_prepare(d, f);
- if (skb) {
- skb->next = NULL;
- if (d->sendq_hd)
- d->sendq_tl->next = skb;
- else
- d->sendq_hd = skb;
- d->sendq_tl = skb;
- }
+ skb->dev = d->ifp;
+ skb_get(skb);
+ skb->next = NULL;
+ if (d->sendq_hd)
+ d->sendq_tl->next = skb;
+ else
+ d->sendq_hd = skb;
+ d->sendq_tl = skb;
}

static int
@@ -514,7 +493,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
calc_rttavg(d, tsince(f->tag));

ahin = (struct aoe_atahdr *) (hin+1);
- ahout = (struct aoe_atahdr *) (f->data + sizeof(struct aoe_hdr));
+ ahout = (struct aoe_atahdr *) (f->skb->mac.raw + sizeof(struct aoe_hdr));
buf = f->buf;

if (ahout->cmdstat == WIN_IDENTIFY)
@@ -620,20 +599,21 @@ aoecmd_ata_id(struct aoedev *d)
}

/* initialize the headers & frame */
- h = (struct aoe_hdr *) f->data;
+ skb = f->skb;
+ h = (struct aoe_hdr *) skb->mac.raw;
ah = (struct aoe_atahdr *) (h+1);
- f->ndata = sizeof *h + sizeof *ah;
- memset(h, 0, f->ndata);
+ skb->len = sizeof *h + sizeof *ah;
+ memset(h, 0, skb->len);
f->tag = aoehdr_atainit(d, h);
f->waited = 0;
- f->writedatalen = 0;

/* set up ata header */
ah->scnt = 1;
ah->cmdstat = WIN_IDENTIFY;
ah->lba3 = 0xa0;

- skb = skb_prepare(d, f);
+ skb->dev = d->ifp;
+ skb_get(skb);

d->rttavg = MAXTIMER;
d->timer.function = rexmit_timer;
diff -upr 2.6.18-rc4-orig/drivers/block/aoe/aoedev.c 2.6.18-rc4-aoe/drivers/block/aoe/aoedev.c
--- 2.6.18-rc4-orig/drivers/block/aoe/aoedev.c 2006-08-17 16:45:34.000000000 -0400
+++ 2.6.18-rc4-aoe/drivers/block/aoe/aoedev.c 2006-08-17 16:45:34.000000000 -0400
@@ -63,22 +63,32 @@ aoedev_newdev(ulong nframes)
struct frame *f, *e;

d = kzalloc(sizeof *d, GFP_ATOMIC);
- if (d == NULL)
- return NULL;
f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
- if (f == NULL) {
- kfree(d);
+ switch (!d || !f) {
+ case 0:
+ d->nframes = nframes;
+ d->frames = f;
+ e = f + nframes;
+ for (; f<e; f++) {
+ f->tag = FREETAG;
+ f->skb = new_skb(ETH_ZLEN);
+ if (!f->skb)
+ break;
+ }
+ if (f == e)
+ break;
+ while (f > d->frames) {
+ f--;
+ dev_kfree_skb(f->skb);
+ }
+ default:
+ if (f)
+ kfree(f);
+ if (d)
+ kfree(d);
return NULL;
}
-
INIT_WORK(&d->work, aoecmd_sleepwork, d);
-
- d->nframes = nframes;
- d->frames = f;
- e = f + nframes;
- for (; f<e; f++)
- f->tag = FREETAG;
-
spin_lock_init(&d->lock);
init_timer(&d->timer);
d->timer.data = (ulong) d;
@@ -160,11 +170,19 @@ aoedev_by_sysminor_m(ulong sysminor, ulo
static void
aoedev_freedev(struct aoedev *d)
{
+ struct frame *f, *e;
+
if (d->gd) {
aoedisk_rm_sysfs(d);
del_gendisk(d->gd);
put_disk(d->gd);
}
+ f = d->frames;
+ e = f + d->nframes;
+ for (; f<e; f++) {
+ skb_shinfo(f->skb)->nr_frags = 0;
+ dev_kfree_skb(f->skb);
+ }
kfree(d->frames);
if (d->bufpool)
mempool_destroy(d->bufpool);


--
"Ed L. Cashin" <[email protected]>


2006-08-20 15:38:17

by Alan

[permalink] [raw]
Subject: Re: [PATCH 2.6.18-rc4] aoe [04/13]: zero copy write 1 of 2

Ar Gwe, 2006-08-18 am 13:39 -0400, ysgrifennodd Ed L. Cashin:
> Signed-off-by: "Ed L. Cashin" <[email protected]>

> + skb->len = sizeof *h + sizeof *ah;
> + memset(h, 0, skb->len);

Never play with skb->len directly. Use skb_put/skb_trim


2006-08-22 22:51:24

by Ed L. Cashin

[permalink] [raw]
Subject: Re: [PATCH 2.6.18-rc4] aoe [04/13]: zero copy write 1 of 2

On Sat, Aug 19, 2006 at 11:18:12AM +0100, Alan Cox wrote:
> Ar Gwe, 2006-08-18 am 13:39 -0400, ysgrifennodd Ed L. Cashin:
> > Signed-off-by: "Ed L. Cashin" <[email protected]>
>
> > + skb->len = sizeof *h + sizeof *ah;
> > + memset(h, 0, skb->len);
>
> Never play with skb->len directly. Use skb_put/skb_trim

These are skbs pre-allocated by the aoe driver that will always have
enough room to accomodate this much data, and we are really setting
the packet header length.

To use skb_put here seems awkward. We'd have to do things like shown
below throughout the driver instead of just setting the length. Is
that what you'd like to see?

diff -upr 2.6.18-rc4-orig/drivers/block/aoe/aoecmd.c 2.6.18-rc4-aoe/drivers/block/aoe/aoecmd.c
--- 2.6.18-rc4-orig/drivers/block/aoe/aoecmd.c 2006-08-22 12:48:18.000000000 -0400
+++ 2.6.18-rc4-aoe/drivers/block/aoe/aoecmd.c 2006-08-22 17:03:23.000000000 -0400
@@ -314,7 +315,9 @@ rexmit(struct aoedev *d, struct frame *f
if (ah->aflags & AOEAFL_WRITE) {
skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
offset_in_page(f->bufaddr), DEFAULTBCNT);
- skb->len = sizeof *h + sizeof *ah + DEFAULTBCNT;
+ skb->data_len = 0;
+ skb_trim(skb, 0);
+ skb_put(skb, sizeof *h + sizeof *ah + DEFAULTBCNT);
skb->data_len = DEFAULTBCNT;
}
if (++d->lostjumbo > (d->nframes << 1))


--
Ed L Cashin <[email protected]>

2006-08-23 14:41:42

by Alan

[permalink] [raw]
Subject: Re: [PATCH 2.6.18-rc4] aoe [04/13]: zero copy write 1 of 2

Ar Maw, 2006-08-22 am 17:21 -0400, ysgrifennodd Ed L. Cashin:
> These are skbs pre-allocated by the aoe driver that will always have
> enough room to accomodate this much data, and we are really setting
> the packet header length.

The skb structure has other fields and if you fiddle with them by hand
you break those and you end up breaking if the skb internals change.

Eg if you set skb->len you must set skb->tail. Functions like
skb_addd_data, skb_put, skb_trim, etc do the right thing in all cases.

> To use skb_put here seems awkward. We'd have to do things like shown
> below throughout the driver instead of just setting the length. Is
> that what you'd like to see?

Yes. It might take a clock or two longer but it sets skb->tail right and
is rather more future proof.

2006-09-14 20:07:44

by Ed L. Cashin

[permalink] [raw]
Subject: Re: [PATCH 2.6.18-rc4] aoe [04/13]: zero copy write 1 of 2

[email protected]> <[email protected]> <20060822212150.
[email protected]> <[email protected]>
In-Reply-To: <[email protected]>
User-Agent: Mutt/1.5.11+cvs20060126

Hi. Back in August I sent out some patches for the aoe driver, and
Alan objected to the direct setting of skb->len in one of them. I
asked whether he was suggesting that we use something like this
instead of setting skb->len:

skb->data_len = 0;
skb_trim(skb, 0);
skb_put(skb, sizeof *h + sizeof *ah + DEFAULTBCNT);

... and Alan said that was acceptible because it takes care of
skb->tail, checks for overflow, and is more future proof.

So I took some time to implement the necessary changes, but then it
became apparent that there was a problem.

The skb_trim and skb_put macros are only for non-linear skbuffs, but
we are only using the area inside the skbuff itself for packet
headers, not data.

When we do something like this:

if (bio_data_dir(buf->bio) == WRITE) {
skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
ah.aflags |= AOEAFL_WRITE;
skb->len += bcnt;
skb->data_len = bcnt;
t->wpkts++;

... skb->tail isn't really relevant, because we are only using the
pre-allocated part of the skbuff for headers, and the headers aren't
expanding here. Other parts of the kernel that aren't putting data in
the skbuff itself set the skb->len directly.

e1000_main.c
ip_output.c
tcp.c
ip6_output.c

So is it correct for the callers of skb_fill_page_desc to set skb->len
or is another interface needed besides skb_put/skb_trim? Such a new
interface would be able to maintain the consistency of the skbuff
fields without assuming that the data is in the skbuff itself.

If a new interface is needed, then it seems like we should set
skb->len in this patch until the new interface is ready.

--
Ed L Cashin <[email protected]>