2009-10-14 15:59:33

by Gregory Haskins

Subject: [NET PATCH 0/9] ZC/L4RO enhancements to alacrityvm::vbus-enet driver

The following series applies to the "linux-next" branch in the
alacrityvm tree:

git://git.kernel.org/pub/scm/linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git

These patches add zero-copy and reassembly-offload support to the
venet driver. This means we can transmit a guest GSO packet directly
to the host hardware and receive fully reassembled LRO frames without
artificially segmenting them.

Unofficial testing against a ZC/L4RO-capable backend shows that we
sustain about 6.6Gb/s of throughput (vs 7.3Gb/s for native), up from
the prior result of 5.7Gb/s, without affecting our latency numbers.
I will officially re-run my tests and update the graphs ASAP.

http://developer.novell.com/wiki/index.php/AlacrityVM

Kind Regards,
-Greg

---

Gregory Haskins (9):
venet: add Layer-4 Reassembler Offload (L4RO) support
venet: add a tx-complete event for out-of-order support
venet: use an skblist for outstanding descriptors
venet: add eventq protocol
venet: cache the ringlen values at init
venet: report actual used descriptor size
venet: add pre-mapped tx descriptor feature
venet: fix gso.hdr_len to report correct length
venet: Update maintainer


MAINTAINERS | 7
drivers/net/vbus-enet.c | 770 +++++++++++++++++++++++++++++++++++++++++++----
include/linux/venet.h | 61 +++-
3 files changed, 764 insertions(+), 74 deletions(-)


2009-10-14 15:59:46

by Gregory Haskins

Subject: [NET PATCH 1/9] venet: Update maintainer

Signed-off-by: Gregory Haskins <[email protected]>
---

MAINTAINERS | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index fe97eb1..55fabad 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5617,6 +5617,13 @@ S: Maintained
F: include/linux/vbus*
F: drivers/vbus/*

+VBUS ETHERNET DRIVER
+M: Gregory Haskins <[email protected]>
+S: Maintained
+W: http://developer.novell.com/wiki/index.php/AlacrityVM
+F: include/linux/venet.h
+F: drivers/net/vbus-enet.c
+
VFAT/FAT/MSDOS FILESYSTEM
M: OGAWA Hirofumi <[email protected]>
S: Maintained

2009-10-14 15:59:56

by Gregory Haskins

Subject: [NET PATCH 2/9] venet: fix gso.hdr_len to report correct length

This seemed to work for TSO4/6 frames, but breaks for UFO. In either
case, it's just plain wrong, so let's get the header length set properly.
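
For illustration only, here is a tiny userspace model (simplified,
made-up fields, not the kernel's struct sk_buff) contrasting the two
values involved: the transport-header offset the code used before,
and the linear head length that skb_headlen() reports, which this
patch switches to:

/* Toy skb with just enough fields to show the two computations the
 * patch distinguishes: transport-header offset vs. linear head length.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_skb {
	uint32_t len;              /* total bytes (linear + paged) */
	uint32_t data_len;         /* bytes held in paged fragments */
	uint32_t transport_offset; /* offset of L4 header from skb->data */
};

static uint32_t headlen(const struct toy_skb *skb)
{
	return skb->len - skb->data_len;   /* what skb_headlen() reports */
}

int main(void)
{
	/* e.g. a large offloaded packet: headers plus some payload in the
	 * linear area, the rest in pages (numbers are hypothetical) */
	struct toy_skb skb = { .len = 65226, .data_len = 64000,
			       .transport_offset = 34 };

	printf("old hdrlen (transport offset) = %u\n", skb.transport_offset);
	printf("new hdrlen (skb_headlen)      = %u\n", headlen(&skb));
	return 0;
}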

Signed-off-by: Gregory Haskins <[email protected]>
---

drivers/net/vbus-enet.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 91c47a9..3d61444 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -512,7 +512,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)

vsg->flags |= VENET_SG_FLAG_GSO;

- vsg->gso.hdrlen = skb_transport_header(skb) - skb->data;
+ vsg->gso.hdrlen = skb_headlen(skb);
vsg->gso.size = sinfo->gso_size;
if (sinfo->gso_type & SKB_GSO_TCPV4)
vsg->gso.type = VENET_GSO_TYPE_TCPV4;

2009-10-14 16:04:35

by Gregory Haskins

Subject: [NET PATCH 3/9] venet: add pre-mapped tx descriptor feature

What: Pre-allocate and map our scatter-gather descriptors.

Why: The host cannot directly access guest memory, so any indirection
adds additional overhead. We currently implement scatter-gather by
pushing a pointer to the sg-descriptor, which in turn points to the
actual SKB. This means the host must perform an extra read just to
obtain the pointer to the SKB data.

Therefore, we introduce a new shared-memory region that consists of
pre-allocated scatter-gather descriptors. The host may then decode a
descriptor pointer as an offset into this pre-mapped region and save
time/overhead.
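
To illustrate the offset-based lookup (a minimal standalone C sketch,
not the driver code; the pool size and structures are made-up
stand-ins for venet_sg), the guest publishes slot offsets and the host
resolves them against its own mapping of the shared region:

/* Simplified model of the pre-mapped tx descriptor (PMTD) idea:
 * the guest carves one shared region into fixed-size slots and
 * publishes the slot *offset*, which the host can resolve against
 * its own mapping of the region without an extra guest-memory read.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define RINGLEN   8
#define DESC_SIZE 128   /* stand-in for sizeof(venet_sg) + iov space */

int main(void)
{
	char *pool = calloc(RINGLEN, DESC_SIZE);   /* shared, pre-mapped once */
	uint64_t ring[RINGLEN];                    /* descriptor "ptr" fields */
	int i;

	if (!pool)
		return 1;

	/* guest side: publish offsets instead of physical addresses */
	for (i = 0; i < RINGLEN; i++)
		ring[i] = (uint64_t)(i * DESC_SIZE);

	/* host side: resolve an offset directly within its own mapping */
	for (i = 0; i < RINGLEN; i++) {
		void *vsg = pool + ring[i];
		printf("slot %d -> offset %llu -> %p\n",
		       i, (unsigned long long)ring[i], vsg);
	}

	free(pool);
	return 0;
}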

Signed-off-by: Gregory Haskins <[email protected]>
---

drivers/net/vbus-enet.c | 62 +++++++++++++++++++++++++++++++++++++++++------
include/linux/venet.h | 12 +++++----
2 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 3d61444..b3e9695 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -61,6 +61,10 @@ struct vbus_enet_priv {
struct vbus_enet_queue txq;
struct tasklet_struct txtask;
bool sg;
+ struct {
+ bool enabled;
+ char *pool;
+ } pmtd; /* pre-mapped transmit descriptors */
};

static void vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force);
@@ -201,7 +205,9 @@ rx_teardown(struct vbus_enet_priv *priv)
static int
tx_setup(struct vbus_enet_priv *priv)
{
- struct ioq *ioq = priv->txq.queue;
+ struct ioq *ioq = priv->txq.queue;
+ size_t iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
+ size_t len = sizeof(struct venet_sg) + iovlen;
struct ioq_iterator iter;
int i;
int ret;
@@ -213,6 +219,29 @@ tx_setup(struct vbus_enet_priv *priv)
*/
return 0;

+ /* pre-allocate our descriptor pool if pmtd is enabled */
+ if (priv->pmtd.enabled) {
+ struct vbus_device_proxy *dev = priv->vdev;
+ size_t poollen = len * tx_ringlen;
+ char *pool;
+ int shmid;
+
+ /* pmtdquery will return the shm-id to use for the pool */
+ ret = devcall(priv, VENET_FUNC_PMTDQUERY, NULL, 0);
+ BUG_ON(ret < 0);
+
+ shmid = ret;
+
+ pool = kzalloc(poollen, GFP_KERNEL | GFP_DMA);
+ if (!pool)
+ return -ENOMEM;
+
+ priv->pmtd.pool = pool;
+
+ ret = dev->ops->shm(dev, shmid, 0, pool, poollen, 0, NULL, 0);
+ BUG_ON(ret < 0);
+ }
+
ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
BUG_ON(ret < 0);

@@ -224,16 +253,22 @@ tx_setup(struct vbus_enet_priv *priv)
*/
for (i = 0; i < tx_ringlen; i++) {
struct venet_sg *vsg;
- size_t iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
- size_t len = sizeof(*vsg) + iovlen;

- vsg = kzalloc(len, GFP_KERNEL);
- if (!vsg)
- return -ENOMEM;
+ if (priv->pmtd.enabled) {
+ size_t offset = (i * len);
+
+ vsg = (struct venet_sg *)&priv->pmtd.pool[offset];
+ iter.desc->ptr = (u64)offset;
+ } else {
+ vsg = kzalloc(len, GFP_KERNEL);
+ if (!vsg)
+ return -ENOMEM;
+
+ iter.desc->ptr = (u64)__pa(vsg);
+ }

iter.desc->cookie = (u64)vsg;
iter.desc->len = len;
- iter.desc->ptr = (u64)__pa(vsg);

ret = ioq_iter_seek(&iter, ioq_seek_next, 0, 0);
BUG_ON(ret < 0);
@@ -259,6 +294,14 @@ tx_teardown(struct vbus_enet_priv *priv)
*/
return;

+ if (priv->pmtd.enabled) {
+ /*
+ * PMTD mode means we only need to free the pool
+ */
+ kfree(priv->pmtd.pool);
+ return;
+ }
+
ret = ioq_iter_init(ioq, &iter, ioq_idxtype_valid, 0);
BUG_ON(ret < 0);

@@ -705,7 +748,7 @@ vbus_enet_negcap(struct vbus_enet_priv *priv)
if (sg_enabled) {
caps.gid = VENET_CAP_GROUP_SG;
caps.bits |= (VENET_CAP_SG|VENET_CAP_TSO4|VENET_CAP_TSO6
- |VENET_CAP_ECN);
+ |VENET_CAP_ECN|VENET_CAP_PMTD);
/* note: exclude UFO for now due to stack bug */
}

@@ -726,6 +769,9 @@ vbus_enet_negcap(struct vbus_enet_priv *priv)
dev->features |= NETIF_F_TSO6;
if (caps.bits & VENET_CAP_ECN)
dev->features |= NETIF_F_TSO_ECN;
+
+ if (caps.bits & VENET_CAP_PMTD)
+ priv->pmtd.enabled = true;
}

return 0;
diff --git a/include/linux/venet.h b/include/linux/venet.h
index 47ed37d..57aeddd 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -45,6 +45,7 @@ struct venet_capabilities {
#define VENET_CAP_TSO6 (1 << 2)
#define VENET_CAP_ECN (1 << 3)
#define VENET_CAP_UFO (1 << 4)
+#define VENET_CAP_PMTD (1 << 5) /* pre-mapped tx desc */

struct venet_iov {
__u32 len;
@@ -75,10 +76,11 @@ struct venet_sg {
struct venet_iov iov[1];
};

-#define VENET_FUNC_LINKUP 0
-#define VENET_FUNC_LINKDOWN 1
-#define VENET_FUNC_MACQUERY 2
-#define VENET_FUNC_NEGCAP 3 /* negotiate capabilities */
-#define VENET_FUNC_FLUSHRX 4
+#define VENET_FUNC_LINKUP 0
+#define VENET_FUNC_LINKDOWN 1
+#define VENET_FUNC_MACQUERY 2
+#define VENET_FUNC_NEGCAP 3 /* negotiate capabilities */
+#define VENET_FUNC_FLUSHRX 4
+#define VENET_FUNC_PMTDQUERY 5

#endif /* _LINUX_VENET_H */

2009-10-14 16:10:34

by Gregory Haskins

Subject: [NET PATCH 4/9] venet: report actual used descriptor size

This should reduce wasted effort copying parts of the descriptor
which are not in use, since the descriptors are typically pre-allocated
to their maximum size.
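
As a standalone illustration (a simplified subset of the venet.h
fields, not driver code), the reported length is just the header plus
the IOV entries actually in use, mirroring the VSG_DESC_SIZE() macro
added below:

/* Standalone illustration of VSG_DESC_SIZE(): report only the bytes
 * actually used by 'count' IOV entries instead of the maximum-sized
 * pre-allocated descriptor.
 */
#include <stdio.h>
#include <stdint.h>

struct venet_iov {
	uint32_t len;
	uint64_t ptr;
};

struct venet_sg {
	uint64_t cookie;            /* simplified subset of the real fields */
	uint32_t flags;
	uint32_t count;
	struct venet_iov iov[1];    /* variable-length tail, as in venet.h */
};

#define VSG_DESC_SIZE(count) (sizeof(struct venet_sg) + \
			      sizeof(struct venet_iov) * ((count) - 1))

int main(void)
{
	unsigned int count;

	for (count = 1; count <= 4; count++)
		printf("count=%u -> used=%zu bytes\n",
		       count, VSG_DESC_SIZE(count));

	return 0;
}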

Signed-off-by: Gregory Haskins <[email protected]>
---

drivers/net/vbus-enet.c | 2 ++
include/linux/venet.h | 3 +++
2 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index b3e9695..63237f3 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -582,6 +582,8 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
iov->ptr = (u64)sg_phys(sg);
}

+ iter.desc->len = (u64)VSG_DESC_SIZE(vsg->count);
+
} else {
/*
* non scatter-gather mode: simply put the skb right onto the
diff --git a/include/linux/venet.h b/include/linux/venet.h
index 57aeddd..53b6958 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -76,6 +76,9 @@ struct venet_sg {
struct venet_iov iov[1];
};

+#define VSG_DESC_SIZE(count) (sizeof(struct venet_sg) + \
+ sizeof(struct venet_iov) * ((count) - 1))
+
#define VENET_FUNC_LINKUP 0
#define VENET_FUNC_LINKDOWN 1
#define VENET_FUNC_MACQUERY 2

2009-10-14 16:09:49

by Gregory Haskins

Subject: [NET PATCH 5/9] venet: cache the ringlen values at init

We want to prevent the condition where changes to the module parameters
could affect the run-time validity of the ring state.
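
A minimal userspace sketch of the pattern (hypothetical names, not the
driver itself): snapshot the tunable into the queue state at init and
use only the cached copy afterwards:

/* Sketch of caching a tunable at init time so later changes to the
 * module parameter cannot invalidate already-established ring state.
 */
#include <stdio.h>

static int tx_ringlen = 256;   /* stand-in for the module parameter */

struct queue_state {
	unsigned long count;       /* cached copy, fixed for the queue's life */
};

static void queue_init(struct queue_state *q, unsigned long ringsize)
{
	q->count = ringsize;       /* snapshot the value once */
}

int main(void)
{
	struct queue_state txq;

	queue_init(&txq, tx_ringlen);

	tx_ringlen = 64;           /* a later change no longer matters */

	printf("ring operates on cached count=%lu (param now %d)\n",
	       txq.count, tx_ringlen);
	return 0;
}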

Signed-off-by: Gregory Haskins <[email protected]>
---

drivers/net/vbus-enet.c | 7 +++++--
1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 63237f3..fe9eeca 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -50,6 +50,7 @@ module_param(sg_enabled, int, 0444);
struct vbus_enet_queue {
struct ioq *queue;
struct ioq_notifier notifier;
+ unsigned long count;
};

struct vbus_enet_priv {
@@ -94,6 +95,8 @@ queue_init(struct vbus_enet_priv *priv,
q->queue->notifier = &q->notifier;
}

+ q->count = ringsize;
+
return 0;
}

@@ -222,7 +225,7 @@ tx_setup(struct vbus_enet_priv *priv)
/* pre-allocate our descriptor pool if pmtd is enabled */
if (priv->pmtd.enabled) {
struct vbus_device_proxy *dev = priv->vdev;
- size_t poollen = len * tx_ringlen;
+ size_t poollen = len * priv->txq.count;
char *pool;
int shmid;

@@ -251,7 +254,7 @@ tx_setup(struct vbus_enet_priv *priv)
/*
* Now populate each descriptor with an empty SG descriptor
*/
- for (i = 0; i < tx_ringlen; i++) {
+ for (i = 0; i < priv->txq.count; i++) {
struct venet_sg *vsg;

if (priv->pmtd.enabled) {

2009-10-14 16:09:59

by Gregory Haskins

Subject: [NET PATCH 6/9] venet: add eventq protocol

This adds an event-channel for passing host->guest messages to the
guest driver. We will use this later in the series for linkstate and
asynchronous transmit-complete events.
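
For reference, a simplified userspace model of the consumer side of
such an event channel (made-up structures standing in for the
ioq-based loop in this patch): walk the ring until a slot is still
owned by the producer, dispatching on a small event header:

/* Simplified model of the eventq consumer: each slot carries a small
 * header identifying the event type; processing stops at the first
 * slot still owned by the producer ("south side").
 */
#include <stdio.h>
#include <stdint.h>

struct venet_event_header {
	uint32_t flags;
	uint32_t size;
	uint32_t id;
};

#define VENET_EVENT_LINKSTATE 0

struct slot {
	int sown;                        /* still owned by the producer? */
	struct venet_event_header hdr;
	uint8_t payload[32];
};

static void handle_linkstate(const struct slot *s)
{
	printf("linkstate event, payload[0]=%u (0=down, 1=up)\n",
	       (unsigned)s->payload[0]);
}

int main(void)
{
	struct slot ring[4] = {
		{ 0, { 0, 8, VENET_EVENT_LINKSTATE }, { 1 } },
		{ 0, { 0, 8, VENET_EVENT_LINKSTATE }, { 0 } },
		{ 1, { 0, 0, 0 }, { 0 } },   /* producer still owns this one */
	};
	int i;

	for (i = 0; i < 4 && !ring[i].sown; i++) {
		switch (ring[i].hdr.id) {
		case VENET_EVENT_LINKSTATE:
			handle_linkstate(&ring[i]);
			break;
		default:
			fprintf(stderr, "unexpected event id %u\n",
				ring[i].hdr.id);
			break;
		}
	}

	printf("%d events consumed\n", i);
	return 0;
}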

Signed-off-by: Gregory Haskins <[email protected]>
---

drivers/net/vbus-enet.c | 203 +++++++++++++++++++++++++++++++++++++++++++++++
include/linux/venet.h | 28 ++++++
2 files changed, 229 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index fe9eeca..5fccfd1 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -66,6 +66,14 @@ struct vbus_enet_priv {
bool enabled;
char *pool;
} pmtd; /* pre-mapped transmit descriptors */
+ struct {
+ bool enabled;
+ bool linkstate;
+ unsigned long evsize;
+ struct vbus_enet_queue veq;
+ struct tasklet_struct task;
+ char *pool;
+ } evq;
};

static void vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force);
@@ -331,6 +339,16 @@ tx_teardown(struct vbus_enet_priv *priv)
}
}

+static void
+evq_teardown(struct vbus_enet_priv *priv)
+{
+ if (!priv->evq.enabled)
+ return;
+
+ ioq_put(priv->evq.veq.queue);
+ kfree(priv->evq.pool);
+}
+
/*
* Open and close
*/
@@ -741,8 +759,91 @@ tx_isr(struct ioq_notifier *notifier)
tasklet_schedule(&priv->txtask);
}

+static void
+evq_linkstate_event(struct vbus_enet_priv *priv,
+ struct venet_event_header *header)
+{
+ struct venet_event_linkstate *event =
+ (struct venet_event_linkstate *)header;
+
+ switch (event->state) {
+ case 0:
+ netif_carrier_off(priv->dev);
+ break;
+ case 1:
+ netif_carrier_on(priv->dev);
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+deferred_evq_isr(unsigned long data)
+{
+ struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
+ int nevents = 0;
+ struct ioq_iterator iter;
+ int ret;
+
+ PDEBUG(priv->dev, "evq: polling...\n");
+
+ /* We want to iterate on the head of the in-use index */
+ ret = ioq_iter_init(priv->evq.veq.queue, &iter, ioq_idxtype_inuse,
+ IOQ_ITER_AUTOUPDATE);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * The EOM is indicated by finding a packet that is still owned by
+ * the south side
+ */
+ while (!iter.desc->sown) {
+ struct venet_event_header *header;
+
+ header = (struct venet_event_header *)iter.desc->cookie;
+
+ switch (header->id) {
+ case VENET_EVENT_LINKSTATE:
+ evq_linkstate_event(priv, header);
+ break;
+ default:
+ panic("venet: unexpected event id:%d of size %d\n",
+ header->id, header->size);
+ break;
+ }
+
+ memset((void *)iter.desc->cookie, 0, priv->evq.evsize);
+
+ /* Advance the in-use tail */
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+
+ nevents++;
+ }
+
+ PDEBUG(priv->dev, "%d events received\n", nevents);
+
+ ioq_notify_enable(priv->evq.veq.queue, 0);
+}
+
+static void
+evq_isr(struct ioq_notifier *notifier)
+{
+ struct vbus_enet_priv *priv;
+
+ priv = container_of(notifier, struct vbus_enet_priv, evq.veq.notifier);
+
+ PDEBUG(priv->dev, "evq_isr\n");
+
+ ioq_notify_disable(priv->evq.veq.queue, 0);
+ tasklet_schedule(&priv->evq.task);
+}
+
static int
-vbus_enet_negcap(struct vbus_enet_priv *priv)
+vbus_enet_sg_negcap(struct vbus_enet_priv *priv)
{
struct net_device *dev = priv->dev;
struct venet_capabilities caps;
@@ -782,6 +883,103 @@ vbus_enet_negcap(struct vbus_enet_priv *priv)
return 0;
}

+static int
+vbus_enet_evq_negcap(struct vbus_enet_priv *priv, unsigned long count)
+{
+ struct venet_capabilities caps;
+ int ret;
+
+ memset(&caps, 0, sizeof(caps));
+
+ caps.gid = VENET_CAP_GROUP_EVENTQ;
+ caps.bits |= VENET_CAP_EVQ_LINKSTATE;
+
+ ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
+ if (ret < 0)
+ return ret;
+
+ if (caps.bits) {
+ struct vbus_device_proxy *dev = priv->vdev;
+ struct venet_eventq_query query;
+ size_t poollen;
+ struct ioq_iterator iter;
+ char *pool;
+ int i;
+
+ priv->evq.enabled = true;
+
+ if (caps.bits & VENET_CAP_EVQ_LINKSTATE) {
+ /*
+ * We will assume there is no carrier until we get
+ * an event telling us otherwise
+ */
+ netif_carrier_off(priv->dev);
+ priv->evq.linkstate = true;
+ }
+
+ memset(&query, 0, sizeof(query));
+
+ ret = devcall(priv, VENET_FUNC_EVQQUERY, &query, sizeof(query));
+ if (ret < 0)
+ return ret;
+
+ priv->evq.evsize = query.evsize;
+ poollen = query.evsize * count;
+
+ pool = kzalloc(poollen, GFP_KERNEL | GFP_DMA);
+ if (!pool)
+ return -ENOMEM;
+
+ priv->evq.pool = pool;
+
+ ret = dev->ops->shm(dev, query.dpid, 0,
+ pool, poollen, 0, NULL, 0);
+ if (ret < 0)
+ return ret;
+
+ queue_init(priv, &priv->evq.veq, query.qid, count, evq_isr);
+
+ ret = ioq_iter_init(priv->evq.veq.queue,
+ &iter, ioq_idxtype_valid, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_set, 0, 0);
+ BUG_ON(ret < 0);
+
+ /* Now populate each descriptor with an empty event */
+ for (i = 0; i < count; i++) {
+ size_t offset = (i * query.evsize);
+ void *addr = &priv->evq.pool[offset];
+
+ iter.desc->ptr = (u64)offset;
+ iter.desc->cookie = (u64)addr;
+ iter.desc->len = query.evsize;
+
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ /* Finally, enable interrupts */
+ tasklet_init(&priv->evq.task, deferred_evq_isr,
+ (unsigned long)priv);
+ ioq_notify_enable(priv->evq.veq.queue, 0);
+ }
+
+ return 0;
+}
+
+static int
+vbus_enet_negcap(struct vbus_enet_priv *priv)
+{
+ int ret;
+
+ ret = vbus_enet_sg_negcap(priv);
+ if (ret < 0)
+ return ret;
+
+ return vbus_enet_evq_negcap(priv, tx_ringlen);
+}
+
static int vbus_enet_set_tx_csum(struct net_device *dev, u32 data)
{
struct vbus_enet_priv *priv = netdev_priv(dev);
@@ -905,6 +1103,9 @@ vbus_enet_remove(struct vbus_device_proxy *vdev)
tx_teardown(priv);
ioq_put(priv->txq.queue);

+ if (priv->evq.enabled)
+ evq_teardown(priv);
+
dev->ops->close(dev, 0);

free_netdev(priv->dev);
diff --git a/include/linux/venet.h b/include/linux/venet.h
index 53b6958..16b0156 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -37,7 +37,8 @@ struct venet_capabilities {
__u32 bits;
};

-#define VENET_CAP_GROUP_SG 0
+#define VENET_CAP_GROUP_SG 0
+#define VENET_CAP_GROUP_EVENTQ 1

/* CAPABILITIES-GROUP SG */
#define VENET_CAP_SG (1 << 0)
@@ -47,6 +48,9 @@ struct venet_capabilities {
#define VENET_CAP_UFO (1 << 4)
#define VENET_CAP_PMTD (1 << 5) /* pre-mapped tx desc */

+/* CAPABILITIES-GROUP EVENTQ */
+#define VENET_CAP_EVQ_LINKSTATE (1 << 0)
+
struct venet_iov {
__u32 len;
__u64 ptr;
@@ -76,6 +80,27 @@ struct venet_sg {
struct venet_iov iov[1];
};

+struct venet_eventq_query {
+ __u32 flags;
+ __u32 evsize; /* size of each event */
+ __u32 dpid; /* descriptor pool-id */
+ __u32 qid;
+ __u8 pad[16];
+};
+
+#define VENET_EVENT_LINKSTATE 0
+
+struct venet_event_header {
+ __u32 flags;
+ __u32 size;
+ __u32 id;
+};
+
+struct venet_event_linkstate {
+ struct venet_event_header header;
+ __u8 state; /* 0 = down, 1 = up */
+};
+
#define VSG_DESC_SIZE(count) (sizeof(struct venet_sg) + \
sizeof(struct venet_iov) * ((count) - 1))

@@ -85,5 +110,6 @@ struct venet_sg {
#define VENET_FUNC_NEGCAP 3 /* negotiate capabilities */
#define VENET_FUNC_FLUSHRX 4
#define VENET_FUNC_PMTDQUERY 5
+#define VENET_FUNC_EVQQUERY 6

#endif /* _LINUX_VENET_H */

2009-10-14 16:09:52

by Gregory Haskins

Subject: [NET PATCH 7/9] venet: use an skblist for outstanding descriptors

This will be useful later in the series so that we can switch to
an asynchronous model.

Signed-off-by: Gregory Haskins <[email protected]>
---

drivers/net/vbus-enet.c | 59 +++++++++++++++++++++++++++--------------------
1 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 5fccfd1..3032169 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -59,8 +59,11 @@ struct vbus_enet_priv {
struct vbus_device_proxy *vdev;
struct napi_struct napi;
struct vbus_enet_queue rxq;
- struct vbus_enet_queue txq;
- struct tasklet_struct txtask;
+ struct {
+ struct vbus_enet_queue veq;
+ struct tasklet_struct task;
+ struct sk_buff_head outstanding;
+ } tx;
bool sg;
struct {
bool enabled;
@@ -76,7 +79,7 @@ struct vbus_enet_priv {
} evq;
};

-static void vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force);
+static void vbus_enet_tx_reap(struct vbus_enet_priv *priv);

static struct vbus_enet_priv *
napi_to_priv(struct napi_struct *napi)
@@ -216,7 +219,7 @@ rx_teardown(struct vbus_enet_priv *priv)
static int
tx_setup(struct vbus_enet_priv *priv)
{
- struct ioq *ioq = priv->txq.queue;
+ struct ioq *ioq = priv->tx.veq.queue;
size_t iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
size_t len = sizeof(struct venet_sg) + iovlen;
struct ioq_iterator iter;
@@ -233,7 +236,7 @@ tx_setup(struct vbus_enet_priv *priv)
/* pre-allocate our descriptor pool if pmtd is enabled */
if (priv->pmtd.enabled) {
struct vbus_device_proxy *dev = priv->vdev;
- size_t poollen = len * priv->txq.count;
+ size_t poollen = len * priv->tx.veq.count;
char *pool;
int shmid;

@@ -262,7 +265,7 @@ tx_setup(struct vbus_enet_priv *priv)
/*
* Now populate each descriptor with an empty SG descriptor
*/
- for (i = 0; i < priv->txq.count; i++) {
+ for (i = 0; i < priv->tx.veq.count; i++) {
struct venet_sg *vsg;

if (priv->pmtd.enabled) {
@@ -291,12 +294,14 @@ tx_setup(struct vbus_enet_priv *priv)
static void
tx_teardown(struct vbus_enet_priv *priv)
{
- struct ioq *ioq = priv->txq.queue;
+ struct ioq *ioq = priv->tx.veq.queue;
struct ioq_iterator iter;
+ struct sk_buff *skb;
int ret;

/* forcefully free all outstanding transmissions */
- vbus_enet_tx_reap(priv, 1);
+ while ((skb = __skb_dequeue(&priv->tx.outstanding)))
+ dev_kfree_skb(skb);

if (!priv->sg)
/*
@@ -529,7 +534,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)

spin_lock_irqsave(&priv->lock, flags);

- if (ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
+ if (ioq_full(priv->tx.veq.queue, ioq_idxtype_valid)) {
/*
* We must flow-control the kernel by disabling the
* queue
@@ -544,7 +549,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
* We want to iterate on the tail of both the "inuse" and "valid" index
* so we specify the "both" index
*/
- ret = ioq_iter_init(priv->txq.queue, &iter, ioq_idxtype_both,
+ ret = ioq_iter_init(priv->tx.veq.queue, &iter, ioq_idxtype_both,
IOQ_ITER_AUTOUPDATE);
BUG_ON(ret < 0);

@@ -620,6 +625,8 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
priv->dev->stats.tx_packets++;
priv->dev->stats.tx_bytes += skb->len;

+ __skb_queue_tail(&priv->tx.outstanding, skb);
+
/*
* This advances both indexes together implicitly, and then
* signals the south side to consume the packet
@@ -629,7 +636,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)

dev->trans_start = jiffies; /* save the timestamp */

- if (ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
+ if (ioq_full(priv->tx.veq.queue, ioq_idxtype_valid)) {
/*
* If the queue is congested, we must flow-control the kernel
*/
@@ -648,7 +655,7 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
* assumes priv->lock held
*/
static void
-vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
+vbus_enet_tx_reap(struct vbus_enet_priv *priv)
{
struct ioq_iterator iter;
int ret;
@@ -658,7 +665,7 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
* do not want the iter_pop (below) to flip the ownership, so
* we set the NOFLIPOWNER option
*/
- ret = ioq_iter_init(priv->txq.queue, &iter, ioq_idxtype_valid,
+ ret = ioq_iter_init(priv->tx.veq.queue, &iter, ioq_idxtype_valid,
IOQ_ITER_NOFLIPOWNER);
BUG_ON(ret < 0);

@@ -669,7 +676,7 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
* We are done once we find the first packet either invalid or still
* owned by the south-side
*/
- while (iter.desc->valid && (!iter.desc->sown || force)) {
+ while (iter.desc->valid && !iter.desc->sown) {
struct sk_buff *skb;

if (priv->sg) {
@@ -687,6 +694,7 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
/* Reset the descriptor */
iter.desc->valid = 0;

+ __skb_unlink(skb, &priv->tx.outstanding);
dev_kfree_skb(skb);

/* Advance the valid-index head */
@@ -699,7 +707,7 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv, int force)
* processing
*/
if (netif_queue_stopped(priv->dev)
- && !ioq_full(priv->txq.queue, ioq_idxtype_valid)) {
+ && !ioq_full(priv->tx.veq.queue, ioq_idxtype_valid)) {
PDEBUG(priv->dev, "re-enabling tx queue\n");
netif_wake_queue(priv->dev);
}
@@ -714,7 +722,7 @@ vbus_enet_timeout(struct net_device *dev)
dev_dbg(&dev->dev, "Transmit timeout\n");

spin_lock_irqsave(&priv->lock, flags);
- vbus_enet_tx_reap(priv, 0);
+ vbus_enet_tx_reap(priv);
spin_unlock_irqrestore(&priv->lock, flags);
}

@@ -740,10 +748,10 @@ deferred_tx_isr(unsigned long data)
PDEBUG(priv->dev, "deferred_tx_isr\n");

spin_lock_irqsave(&priv->lock, flags);
- vbus_enet_tx_reap(priv, 0);
+ vbus_enet_tx_reap(priv);
spin_unlock_irqrestore(&priv->lock, flags);

- ioq_notify_enable(priv->txq.queue, 0);
+ ioq_notify_enable(priv->tx.veq.queue, 0);
}

static void
@@ -751,12 +759,12 @@ tx_isr(struct ioq_notifier *notifier)
{
struct vbus_enet_priv *priv;

- priv = container_of(notifier, struct vbus_enet_priv, txq.notifier);
+ priv = container_of(notifier, struct vbus_enet_priv, tx.veq.notifier);

PDEBUG(priv->dev, "tx_isr\n");

- ioq_notify_disable(priv->txq.queue, 0);
- tasklet_schedule(&priv->txtask);
+ ioq_notify_disable(priv->tx.veq.queue, 0);
+ tasklet_schedule(&priv->tx.task);
}

static void
@@ -1043,16 +1051,17 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
goto out_free;
}

- tasklet_init(&priv->txtask, deferred_tx_isr, (unsigned long)priv);
+ tasklet_init(&priv->tx.task, deferred_tx_isr, (unsigned long)priv);
+ skb_queue_head_init(&priv->tx.outstanding);

queue_init(priv, &priv->rxq, VENET_QUEUE_RX, rx_ringlen, rx_isr);
- queue_init(priv, &priv->txq, VENET_QUEUE_TX, tx_ringlen, tx_isr);
+ queue_init(priv, &priv->tx.veq, VENET_QUEUE_TX, tx_ringlen, tx_isr);

rx_setup(priv);
tx_setup(priv);

ioq_notify_enable(priv->rxq.queue, 0); /* enable interrupts */
- ioq_notify_enable(priv->txq.queue, 0);
+ ioq_notify_enable(priv->tx.veq.queue, 0);

dev->netdev_ops = &vbus_enet_netdev_ops;
dev->watchdog_timeo = 5 * HZ;
@@ -1101,7 +1110,7 @@ vbus_enet_remove(struct vbus_device_proxy *vdev)
ioq_put(priv->rxq.queue);

tx_teardown(priv);
- ioq_put(priv->txq.queue);
+ ioq_put(priv->tx.veq.queue);

if (priv->evq.enabled)
evq_teardown(priv);

2009-10-14 16:09:58

by Gregory Haskins

Subject: [NET PATCH 8/9] venet: add a tx-complete event for out-of-order support

This paves the way for zero-copy support since we cannot predict
the order in which paged-skbs may actually be consumed.
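
To sketch the idea (standalone C with hypothetical names, not the
driver's own code): with a tx-complete event carrying a cookie,
buffers can be released in whatever order the backend finishes them,
rather than strictly in ring order:

/* Model of out-of-order tx completion: submitted buffers sit on an
 * "outstanding" list keyed by a cookie; a TXC event names the cookie
 * it completed, so release order need not match submission order.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct pending {
	uint64_t cookie;
	struct pending *next;
};

static struct pending *outstanding;

static void submit(uint64_t cookie)
{
	struct pending *p = malloc(sizeof(*p));

	if (!p)
		return;

	p->cookie = cookie;
	p->next = outstanding;
	outstanding = p;
}

static void txc_event(uint64_t cookie)
{
	struct pending **pp;

	for (pp = &outstanding; *pp; pp = &(*pp)->next) {
		if ((*pp)->cookie == cookie) {
			struct pending *done = *pp;

			*pp = done->next;
			free(done);
			printf("completed cookie %llu\n",
			       (unsigned long long)cookie);
			return;
		}
	}
	printf("unknown cookie %llu\n", (unsigned long long)cookie);
}

int main(void)
{
	submit(1); submit(2); submit(3);

	/* the backend may finish these in any order */
	txc_event(2);
	txc_event(3);
	txc_event(1);
	return 0;
}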

Signed-off-by: Gregory Haskins <[email protected]>
---

drivers/net/vbus-enet.c | 77 ++++++++++++++++++++++++++++++++++++++---------
include/linux/venet.h | 8 +++++
2 files changed, 70 insertions(+), 15 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index 3032169..e8a0553 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -72,6 +72,7 @@ struct vbus_enet_priv {
struct {
bool enabled;
bool linkstate;
+ bool txc;
unsigned long evsize;
struct vbus_enet_queue veq;
struct tasklet_struct task;
@@ -649,6 +650,17 @@ vbus_enet_tx_start(struct sk_buff *skb, struct net_device *dev)
return 0;
}

+/* assumes priv->lock held */
+static void
+vbus_enet_skb_complete(struct vbus_enet_priv *priv, struct sk_buff *skb)
+{
+ PDEBUG(priv->dev, "completed sending %d bytes\n",
+ skb->len);
+
+ __skb_unlink(skb, &priv->tx.outstanding);
+ dev_kfree_skb(skb);
+}
+
/*
* reclaim any outstanding completed tx packets
*
@@ -677,26 +689,28 @@ vbus_enet_tx_reap(struct vbus_enet_priv *priv)
* owned by the south-side
*/
while (iter.desc->valid && !iter.desc->sown) {
- struct sk_buff *skb;

- if (priv->sg) {
- struct venet_sg *vsg;
+ if (!priv->evq.txc) {
+ struct sk_buff *skb;

- vsg = (struct venet_sg *)iter.desc->cookie;
- skb = (struct sk_buff *)vsg->cookie;
+ if (priv->sg) {
+ struct venet_sg *vsg;

- } else {
- skb = (struct sk_buff *)iter.desc->cookie;
- }
+ vsg = (struct venet_sg *)iter.desc->cookie;
+ skb = (struct sk_buff *)vsg->cookie;
+ } else
+ skb = (struct sk_buff *)iter.desc->cookie;

- PDEBUG(priv->dev, "completed sending %d bytes\n", skb->len);
+ /*
+ * If TXC is not enabled, we are required to free
+ * the buffer resources now
+ */
+ vbus_enet_skb_complete(priv, skb);
+ }

/* Reset the descriptor */
iter.desc->valid = 0;

- __skb_unlink(skb, &priv->tx.outstanding);
- dev_kfree_skb(skb);
-
/* Advance the valid-index head */
ret = ioq_iter_pop(&iter, 0);
BUG_ON(ret < 0);
@@ -787,6 +801,22 @@ evq_linkstate_event(struct vbus_enet_priv *priv,
}

static void
+evq_txc_event(struct vbus_enet_priv *priv,
+ struct venet_event_header *header)
+{
+ struct venet_event_txc *event =
+ (struct venet_event_txc *)header;
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ vbus_enet_tx_reap(priv);
+ vbus_enet_skb_complete(priv, (struct sk_buff *)event->cookie);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void
deferred_evq_isr(unsigned long data)
{
struct vbus_enet_priv *priv = (struct vbus_enet_priv *)data;
@@ -817,6 +847,9 @@ deferred_evq_isr(unsigned long data)
case VENET_EVENT_LINKSTATE:
evq_linkstate_event(priv, header);
break;
+ case VENET_EVENT_TXC:
+ evq_txc_event(priv, header);
+ break;
default:
panic("venet: unexpected event id:%d of size %d\n",
header->id, header->size);
@@ -901,6 +934,7 @@ vbus_enet_evq_negcap(struct vbus_enet_priv *priv, unsigned long count)

caps.gid = VENET_CAP_GROUP_EVENTQ;
caps.bits |= VENET_CAP_EVQ_LINKSTATE;
+ caps.bits |= VENET_CAP_EVQ_TXC;

ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
if (ret < 0)
@@ -925,6 +959,9 @@ vbus_enet_evq_negcap(struct vbus_enet_priv *priv, unsigned long count)
priv->evq.linkstate = true;
}

+ if (caps.bits & VENET_CAP_EVQ_TXC)
+ priv->evq.txc = true;
+
memset(&query, 0, sizeof(query));

ret = devcall(priv, VENET_FUNC_EVQQUERY, &query, sizeof(query));
@@ -1051,7 +1088,6 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
goto out_free;
}

- tasklet_init(&priv->tx.task, deferred_tx_isr, (unsigned long)priv);
skb_queue_head_init(&priv->tx.outstanding);

queue_init(priv, &priv->rxq, VENET_QUEUE_RX, rx_ringlen, rx_isr);
@@ -1060,8 +1096,19 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
rx_setup(priv);
tx_setup(priv);

- ioq_notify_enable(priv->rxq.queue, 0); /* enable interrupts */
- ioq_notify_enable(priv->tx.veq.queue, 0);
+ ioq_notify_enable(priv->rxq.queue, 0); /* enable rx interrupts */
+
+ if (!priv->evq.txc) {
+ /*
+ * If the TXC feature is present, we will receive our
+ * tx-complete notification via the event-channel. Therefore,
+ * we only enable txq interrupts if the TXC feature is not
+ * present.
+ */
+ tasklet_init(&priv->tx.task, deferred_tx_isr,
+ (unsigned long)priv);
+ ioq_notify_enable(priv->tx.veq.queue, 0);
+ }

dev->netdev_ops = &vbus_enet_netdev_ops;
dev->watchdog_timeo = 5 * HZ;
diff --git a/include/linux/venet.h b/include/linux/venet.h
index 16b0156..b6bfd91 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -50,6 +50,7 @@ struct venet_capabilities {

/* CAPABILITIES-GROUP EVENTQ */
#define VENET_CAP_EVQ_LINKSTATE (1 << 0)
+#define VENET_CAP_EVQ_TXC (1 << 1) /* tx-complete */

struct venet_iov {
__u32 len;
@@ -89,6 +90,7 @@ struct venet_eventq_query {
};

#define VENET_EVENT_LINKSTATE 0
+#define VENET_EVENT_TXC 1

struct venet_event_header {
__u32 flags;
@@ -101,6 +103,12 @@ struct venet_event_linkstate {
__u8 state; /* 0 = down, 1 = up */
};

+struct venet_event_txc {
+ struct venet_event_header header;
+ __u32 txqid;
+ __u64 cookie;
+};
+
#define VSG_DESC_SIZE(count) (sizeof(struct venet_sg) + \
sizeof(struct venet_iov) * ((count) - 1))

2009-10-14 16:10:47

by Gregory Haskins

Subject: [NET PATCH 9/9] venet: add Layer-4 Reassembler Offload (L4RO) support

This is the converse of GSO: it lets us receive fully reassembled L4
frames from the host. This allows us to reduce the guest's interrupt
rate, take advantage of host-based hardware that does reassembly, and
skip the SAR overhead for localhost (host->guest, guest->guest)
connectivity.

We accomplish this by re-using the SG support from the transmit/GSO
side and supplying a "page-queue" of free pages to use when we need
frames larger than the MTU.
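
For orientation, a simplified userspace model (hypothetical sizes and
fields, not the driver code) of how a reassembled frame is described:
iov[0] holds the linear MTU-sized buffer, and any additional iovs
reference pages drawn from the page-queue:

/* Model of an L4RO receive descriptor: one linear buffer in iov[0],
 * with extra page-sized iovs appended by the host when the
 * reassembled frame exceeds the MTU.
 */
#include <stdio.h>
#include <stdint.h>

#define MTU       1500
#define PAGE_SIZE 4096

struct iov {
	uint32_t len;
	uint64_t ptr;      /* opaque handle in this model */
};

struct rx_desc {
	uint32_t count;    /* iovs actually used */
	struct iov iov[8];
};

static size_t total_len(const struct rx_desc *d)
{
	size_t len = 0;
	uint32_t i;

	for (i = 0; i < d->count; i++)
		len += d->iov[i].len;
	return len;
}

int main(void)
{
	/* a 9000-byte reassembled frame: linear head plus two pages */
	struct rx_desc d = {
		.count = 3,
		.iov = {
			{ MTU, 0x1000 },
			{ PAGE_SIZE, 0x2000 },
			{ 9000 - MTU - PAGE_SIZE, 0x3000 },
		},
	};

	printf("frame spans %u iovs, %zu bytes total\n",
	       (unsigned)d.count, total_len(&d));
	return 0;
}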

Signed-off-by: Gregory Haskins <[email protected]>
---

drivers/net/vbus-enet.c | 384 +++++++++++++++++++++++++++++++++++++++++++----
include/linux/venet.h | 10 +
2 files changed, 365 insertions(+), 29 deletions(-)

diff --git a/drivers/net/vbus-enet.c b/drivers/net/vbus-enet.c
index e8a0553..6fe2241 100644
--- a/drivers/net/vbus-enet.c
+++ b/drivers/net/vbus-enet.c
@@ -47,6 +47,8 @@ module_param(sg_enabled, int, 0444);

#define PDEBUG(_dev, fmt, args...) dev_dbg(&(_dev)->dev, fmt, ## args)

+#define SG_DESC_SIZE VSG_DESC_SIZE(MAX_SKB_FRAGS)
+
struct vbus_enet_queue {
struct ioq *queue;
struct ioq_notifier notifier;
@@ -78,6 +80,14 @@ struct vbus_enet_priv {
struct tasklet_struct task;
char *pool;
} evq;
+ struct {
+ bool available;
+ char *pool;
+ struct vbus_enet_queue pageq;
+ } l4ro;
+
+ struct sk_buff *(*import)(struct vbus_enet_priv *priv,
+ struct ioq_ring_desc *desc);
};

static void vbus_enet_tx_reap(struct vbus_enet_priv *priv);
@@ -127,29 +137,88 @@ devcall(struct vbus_enet_priv *priv, u32 func, void *data, size_t len)
*/

static void
-rxdesc_alloc(struct net_device *dev, struct ioq_ring_desc *desc, size_t len)
+rxdesc_alloc(struct vbus_enet_priv *priv, struct ioq_ring_desc *desc, size_t len)
{
+ struct net_device *dev = priv->dev;
struct sk_buff *skb;

len += ETH_HLEN;

- skb = netdev_alloc_skb(dev, len + 2);
+ skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
BUG_ON(!skb);

skb_reserve(skb, NET_IP_ALIGN); /* align IP on 16B boundary */

- desc->cookie = (u64)skb;
- desc->ptr = (u64)__pa(skb->data);
- desc->len = len; /* total length */
+ if (priv->l4ro.available) {
+ /*
+ * We will populate an SG descriptor initially with one
+ * IOV filled with an MTU SKB. If the packet needs to be
+ * larger than MTU, the host will grab pages out of the
+ * page-queue and populate additional IOVs
+ */
+ struct venet_sg *vsg = (struct venet_sg *)desc->cookie;
+ struct venet_iov *iov = &vsg->iov[0];
+
+ memset(vsg, 0, SG_DESC_SIZE);
+
+ vsg->cookie = (u64)skb;
+ vsg->count = 1;
+
+ iov->ptr = (u64)__pa(skb->data);
+ iov->len = len;
+ } else {
+ desc->cookie = (u64)skb;
+ desc->ptr = (u64)__pa(skb->data);
+ desc->len = len; /* total length */
+ }
+
desc->valid = 1;
}

static void
+rx_pageq_refill(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->l4ro.pageq.queue;
+ struct ioq_iterator iter;
+ int ret;
+
+ if (ioq_full(ioq, ioq_idxtype_inuse))
+ /* nothing to do if the pageq is already fully populated */
+ return;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_inuse, 0);
+ BUG_ON(ret < 0); /* will never fail unless seriously broken */
+
+ ret = ioq_iter_seek(&iter, ioq_seek_tail, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * Now populate each descriptor with an empty page
+ */
+ while (!iter.desc->sown) {
+ struct page *page;
+
+ page = alloc_page(GFP_KERNEL);
+ BUG_ON(!page);
+
+ iter.desc->cookie = (u64)page;
+ iter.desc->ptr = (u64)__pa(page_address(page));
+ iter.desc->len = PAGE_SIZE;
+
+ ret = ioq_iter_push(&iter, 0);
+ BUG_ON(ret < 0);
+ }
+
+ ioq_signal(ioq, 0);
+}
+
+static void
rx_setup(struct vbus_enet_priv *priv)
{
struct ioq *ioq = priv->rxq.queue;
struct ioq_iterator iter;
int ret;
+ int i = 0;

/*
* We want to iterate on the "valid" index. By default the iterator
@@ -170,10 +239,19 @@ rx_setup(struct vbus_enet_priv *priv)
BUG_ON(ret < 0);

/*
- * Now populate each descriptor with an empty SKB and mark it valid
+ * Now populate each descriptor with an empty buffer and mark it valid
*/
while (!iter.desc->valid) {
- rxdesc_alloc(priv->dev, iter.desc, priv->dev->mtu);
+ if (priv->l4ro.available) {
+ size_t offset = (i * SG_DESC_SIZE);
+ void *addr = &priv->l4ro.pool[offset];
+
+ iter.desc->ptr = (u64)offset;
+ iter.desc->cookie = (u64)addr;
+ iter.desc->len = SG_DESC_SIZE;
+ }
+
+ rxdesc_alloc(priv, iter.desc, priv->dev->mtu);

/*
* This push operation will simultaneously advance the
@@ -182,11 +260,16 @@ rx_setup(struct vbus_enet_priv *priv)
*/
ret = ioq_iter_push(&iter, 0);
BUG_ON(ret < 0);
+
+ i++;
}
+
+ if (priv->l4ro.available)
+ rx_pageq_refill(priv);
}

static void
-rx_teardown(struct vbus_enet_priv *priv)
+rx_rxq_teardown(struct vbus_enet_priv *priv)
{
struct ioq *ioq = priv->rxq.queue;
struct ioq_iterator iter;
@@ -202,7 +285,25 @@ rx_teardown(struct vbus_enet_priv *priv)
* free each valid descriptor
*/
while (iter.desc->valid) {
- struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
+ struct sk_buff *skb;
+
+ if (priv->l4ro.available) {
+ struct venet_sg *vsg;
+ int i;
+
+ vsg = (struct venet_sg *)iter.desc->cookie;
+
+ /* skip i=0, since that is the skb->data IOV */
+ for (i = 1; i < vsg->count; i++) {
+ struct venet_iov *iov = &vsg->iov[i];
+ struct page *page = (struct page *)iov->ptr;
+
+ put_page(page);
+ }
+
+ skb = (struct sk_buff *)vsg->cookie;
+ } else
+ skb = (struct sk_buff *)iter.desc->cookie;

iter.desc->valid = 0;
wmb();
@@ -217,12 +318,54 @@ rx_teardown(struct vbus_enet_priv *priv)
}
}

+static void
+rx_l4ro_teardown(struct vbus_enet_priv *priv)
+{
+ struct ioq *ioq = priv->l4ro.pageq.queue;
+ struct ioq_iterator iter;
+ int ret;
+
+ ret = ioq_iter_init(ioq, &iter, ioq_idxtype_inuse, 0);
+ BUG_ON(ret < 0);
+
+ ret = ioq_iter_seek(&iter, ioq_seek_head, 0, 0);
+ BUG_ON(ret < 0);
+
+ /*
+ * free each valid descriptor
+ */
+ while (iter.desc->sown) {
+ struct page *page = (struct page *)iter.desc->cookie;
+
+ iter.desc->valid = 0;
+ wmb();
+
+ iter.desc->ptr = 0;
+ iter.desc->cookie = 0;
+
+ ret = ioq_iter_pop(&iter, 0);
+ BUG_ON(ret < 0);
+
+ put_page(page);
+ }
+
+ ioq_put(ioq);
+ kfree(priv->l4ro.pool);
+}
+
+static void
+rx_teardown(struct vbus_enet_priv *priv)
+{
+ rx_rxq_teardown(priv);
+
+ if (priv->l4ro.available)
+ rx_l4ro_teardown(priv);
+}
+
static int
tx_setup(struct vbus_enet_priv *priv)
{
struct ioq *ioq = priv->tx.veq.queue;
- size_t iovlen = sizeof(struct venet_iov) * (MAX_SKB_FRAGS-1);
- size_t len = sizeof(struct venet_sg) + iovlen;
struct ioq_iterator iter;
int i;
int ret;
@@ -237,7 +380,7 @@ tx_setup(struct vbus_enet_priv *priv)
/* pre-allocate our descriptor pool if pmtd is enabled */
if (priv->pmtd.enabled) {
struct vbus_device_proxy *dev = priv->vdev;
- size_t poollen = len * priv->tx.veq.count;
+ size_t poollen = SG_DESC_SIZE * priv->tx.veq.count;
char *pool;
int shmid;

@@ -270,12 +413,12 @@ tx_setup(struct vbus_enet_priv *priv)
struct venet_sg *vsg;

if (priv->pmtd.enabled) {
- size_t offset = (i * len);
+ size_t offset = (i * SG_DESC_SIZE);

vsg = (struct venet_sg *)&priv->pmtd.pool[offset];
iter.desc->ptr = (u64)offset;
} else {
- vsg = kzalloc(len, GFP_KERNEL);
+ vsg = kzalloc(SG_DESC_SIZE, GFP_KERNEL);
if (!vsg)
return -ENOMEM;

@@ -283,7 +426,7 @@ tx_setup(struct vbus_enet_priv *priv)
}

iter.desc->cookie = (u64)vsg;
- iter.desc->len = len;
+ iter.desc->len = SG_DESC_SIZE;

ret = ioq_iter_seek(&iter, ioq_seek_next, 0, 0);
BUG_ON(ret < 0);
@@ -444,6 +587,120 @@ vbus_enet_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}

+static struct sk_buff *
+vbus_enet_l4ro_import(struct vbus_enet_priv *priv, struct ioq_ring_desc *desc)
+{
+ struct venet_sg *vsg = (struct venet_sg *)desc->cookie;
+ struct sk_buff *skb = (struct sk_buff *)vsg->cookie;
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ int i;
+
+ rx_pageq_refill(priv);
+
+ if (!vsg->len)
+ /*
+ * the device may send a zero-length packet when it's
+ * flushing references on the ring. We can just drop
+ * these on the floor
+ */
+ goto fail;
+
+ /* advance only by the linear portion in IOV[0] */
+ skb_put(skb, vsg->iov[0].len);
+
+ /* skip i=0, since that is the skb->data IOV */
+ for (i = 1; i < vsg->count; i++) {
+ struct venet_iov *iov = &vsg->iov[i];
+ struct page *page = (struct page *)iov->ptr;
+ skb_frag_t *f = &sinfo->frags[i-1];
+
+ f->page = page;
+ f->page_offset = 0;
+ f->size = iov->len;
+
+ PDEBUG(priv->dev, "SG: Importing %d byte page[%i]\n",
+ f->size, i);
+
+ skb->data_len += f->size;
+ skb->len += f->size;
+ skb->truesize += f->size;
+ sinfo->nr_frags++;
+ }
+
+ if (vsg->flags & VENET_SG_FLAG_NEEDS_CSUM
+ && !skb_partial_csum_set(skb, vsg->csum.start,
+ vsg->csum.offset)) {
+ priv->dev->stats.rx_frame_errors++;
+ goto fail;
+ }
+
+ if (vsg->flags & VENET_SG_FLAG_GSO) {
+ PDEBUG(priv->dev, "L4RO packet detected\n");
+
+ switch (vsg->gso.type) {
+ case VENET_GSO_TYPE_TCPV4:
+ sinfo->gso_type = SKB_GSO_TCPV4;
+ break;
+ case VENET_GSO_TYPE_TCPV6:
+ sinfo->gso_type = SKB_GSO_TCPV6;
+ break;
+ case VENET_GSO_TYPE_UDP:
+ sinfo->gso_type = SKB_GSO_UDP;
+ break;
+ default:
+ PDEBUG(priv->dev, "Illegal L4RO type: %d\n",
+ vsg->gso.type);
+ priv->dev->stats.rx_frame_errors++;
+ goto fail;
+ }
+
+ if (vsg->flags & VENET_SG_FLAG_ECN)
+ sinfo->gso_type |= SKB_GSO_TCP_ECN;
+
+ sinfo->gso_size = vsg->gso.size;
+ if (sinfo->gso_size == 0) {
+ PDEBUG(priv->dev, "Illegal L4RO size: %d\n",
+ vsg->gso.size);
+ priv->dev->stats.rx_frame_errors++;
+ goto fail;
+ }
+
+ /*
+ * Header must be checked, and gso_segs
+ * computed.
+ */
+ sinfo->gso_type |= SKB_GSO_DODGY;
+ sinfo->gso_segs = 0;
+ }
+
+ return skb;
+
+fail:
+ dev_kfree_skb(skb);
+
+ return NULL;
+}
+
+static struct sk_buff *
+vbus_enet_flat_import(struct vbus_enet_priv *priv, struct ioq_ring_desc *desc)
+{
+ struct sk_buff *skb = (struct sk_buff *)desc->cookie;
+
+ if (!desc->len) {
+ /*
+ * the device may send a zero-length packet when it's
+ * flushing references on the ring. We can just drop
+ * these on the floor
+ */
+ dev_kfree_skb(skb);
+ return NULL;
+ }
+
+ skb_put(skb, desc->len);
+
+ return skb;
+}
+
/*
* The poll implementation.
*/
@@ -471,15 +728,14 @@ vbus_enet_poll(struct napi_struct *napi, int budget)
* the south side
*/
while ((npackets < budget) && (!iter.desc->sown)) {
- struct sk_buff *skb = (struct sk_buff *)iter.desc->cookie;
-
- if (iter.desc->len) {
- skb_put(skb, iter.desc->len);
+ struct sk_buff *skb;

+ skb = priv->import(priv, iter.desc);
+ if (skb) {
/* Maintain stats */
npackets++;
priv->dev->stats.rx_packets++;
- priv->dev->stats.rx_bytes += iter.desc->len;
+ priv->dev->stats.rx_bytes += skb->len;

/* Pass the buffer up to the stack */
skb->dev = priv->dev;
@@ -487,16 +743,10 @@ vbus_enet_poll(struct napi_struct *napi, int budget)
netif_receive_skb(skb);

mb();
- } else
- /*
- * the device may send a zero-length packet when its
- * flushing references on the ring. We can just drop
- * these on the floor
- */
- dev_kfree_skb(skb);
+ }

/* Grab a new buffer to put in the ring */
- rxdesc_alloc(priv->dev, iter.desc, priv->dev->mtu);
+ rxdesc_alloc(priv, iter.desc, priv->dev->mtu);

/* Advance the in-use tail */
ret = ioq_iter_pop(&iter, 0);
@@ -1014,6 +1264,69 @@ vbus_enet_evq_negcap(struct vbus_enet_priv *priv, unsigned long count)
}

static int
+vbus_enet_l4ro_negcap(struct vbus_enet_priv *priv, unsigned long count)
+{
+ struct venet_capabilities caps;
+ int ret;
+
+ memset(&caps, 0, sizeof(caps));
+
+ caps.gid = VENET_CAP_GROUP_L4RO;
+ caps.bits |= (VENET_CAP_SG|VENET_CAP_TSO4|VENET_CAP_TSO6
+ |VENET_CAP_ECN);
+
+ ret = devcall(priv, VENET_FUNC_NEGCAP, &caps, sizeof(caps));
+ if (ret < 0) {
+ printk(KERN_ERR "Error negotiating L4RO: %d\n", ret);
+ return ret;
+ }
+
+ if (caps.bits & VENET_CAP_SG) {
+ struct vbus_device_proxy *dev = priv->vdev;
+ size_t poollen = SG_DESC_SIZE * count;
+ struct venet_l4ro_query query;
+ char *pool;
+
+ memset(&query, 0, sizeof(query));
+
+ ret = devcall(priv, VENET_FUNC_L4ROQUERY, &query, sizeof(query));
+ if (ret < 0) {
+ printk(KERN_ERR "Error querying L4RO: %d\n", ret);
+ return ret;
+ }
+
+ pool = kzalloc(poollen, GFP_KERNEL | GFP_DMA);
+ if (!pool)
+ return -ENOMEM;
+
+ /*
+ * pre-mapped descriptor pool
+ */
+ ret = dev->ops->shm(dev, query.dpid, 0,
+ pool, poollen, 0, NULL, 0);
+ if (ret < 0) {
+ printk(KERN_ERR "Error registering L4RO pool: %d\n",
+ ret);
+ kfree(pool);
+ return ret;
+ }
+
+ /*
+ * page-queue: contains a ring of arbitrary pages for
+ * consumption by the host for when the SG::IOV count exceeds
+ * one MTU frame. All we need to do is keep it populated
+ * with free pages.
+ */
+ queue_init(priv, &priv->l4ro.pageq, query.pqid, count, NULL);
+
+ priv->l4ro.pool = pool;
+ priv->l4ro.available = true;
+ }
+
+ return 0;
+}
+
+static int
vbus_enet_negcap(struct vbus_enet_priv *priv)
{
int ret;
@@ -1022,7 +1335,15 @@ vbus_enet_negcap(struct vbus_enet_priv *priv)
if (ret < 0)
return ret;

- return vbus_enet_evq_negcap(priv, tx_ringlen);
+ ret = vbus_enet_evq_negcap(priv, tx_ringlen);
+ if (ret < 0)
+ return ret;
+
+ ret = vbus_enet_l4ro_negcap(priv, rx_ringlen);
+ if (ret < 0)
+ return ret;
+
+ return 0;
}

static int vbus_enet_set_tx_csum(struct net_device *dev, u32 data)
@@ -1088,6 +1409,11 @@ vbus_enet_probe(struct vbus_device_proxy *vdev)
goto out_free;
}

+ if (priv->l4ro.available)
+ priv->import = &vbus_enet_l4ro_import;
+ else
+ priv->import = &vbus_enet_flat_import;
+
skb_queue_head_init(&priv->tx.outstanding);

queue_init(priv, &priv->rxq, VENET_QUEUE_RX, rx_ringlen, rx_isr);
diff --git a/include/linux/venet.h b/include/linux/venet.h
index b6bfd91..0578d79 100644
--- a/include/linux/venet.h
+++ b/include/linux/venet.h
@@ -39,6 +39,7 @@ struct venet_capabilities {

#define VENET_CAP_GROUP_SG 0
#define VENET_CAP_GROUP_EVENTQ 1
+#define VENET_CAP_GROUP_L4RO 2 /* layer-4 reassem offloading */

/* CAPABILITIES-GROUP SG */
#define VENET_CAP_SG (1 << 0)
@@ -109,6 +110,14 @@ struct venet_event_txc {
__u64 cookie;
};

+struct venet_l4ro_query {
+ __u32 flags;
+ __u32 dpid; /* descriptor pool-id */
+ __u32 pqid; /* page queue-id */
+ __u8 pad[20];
+};
+
+
#define VSG_DESC_SIZE(count) (sizeof(struct venet_sg) + \
sizeof(struct venet_iov) * ((count) - 1))

@@ -119,5 +128,6 @@ struct venet_event_txc {
#define VENET_FUNC_FLUSHRX 4
#define VENET_FUNC_PMTDQUERY 5
#define VENET_FUNC_EVQQUERY 6
+#define VENET_FUNC_L4ROQUERY 7

#endif /* _LINUX_VENET_H */