The original skb management for netpoll was a mess; it had two queue paths
and a callback. This changes it to have a per-instance transmit queue
and use a tasklet rather than a work queue for the congested case.
Signed-off-by: Stephen Hemminger <[email protected]>
---
drivers/net/netconsole.c | 1
include/linux/netpoll.h | 3 -
net/core/netpoll.c | 123 +++++++++++++++--------------------------------
3 files changed, 42 insertions(+), 85 deletions(-)
--- linux-2.6.orig/drivers/net/netconsole.c 2006-10-16 14:15:01.000000000 -0700
+++ linux-2.6/drivers/net/netconsole.c 2006-10-19 08:28:25.000000000 -0700
@@ -60,7 +60,6 @@
.local_port = 6665,
.remote_port = 6666,
.remote_mac = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
- .drop = netpoll_queue,
};
static int configured = 0;
--- linux-2.6.orig/include/linux/netpoll.h 2006-10-16 14:15:04.000000000 -0700
+++ linux-2.6/include/linux/netpoll.h 2006-10-19 08:28:25.000000000 -0700
@@ -27,11 +27,12 @@
struct netpoll_info {
spinlock_t poll_lock;
int poll_owner;
- int tries;
int rx_flags;
spinlock_t rx_lock;
struct netpoll *rx_np; /* netpoll that registered an rx_hook */
struct sk_buff_head arp_tx; /* list of arp requests to reply to */
+ struct sk_buff_head tx_q;
+ struct tasklet_struct tx_task;
};
void netpoll_poll(struct netpoll *np);
--- linux-2.6.orig/net/core/netpoll.c 2006-10-19 08:28:04.000000000 -0700
+++ linux-2.6/net/core/netpoll.c 2006-10-19 09:47:38.000000000 -0700
@@ -40,10 +40,6 @@
static int nr_skbs;
static struct sk_buff *skbs;
-static DEFINE_SPINLOCK(queue_lock);
-static int queue_depth;
-static struct sk_buff *queue_head, *queue_tail;
-
static atomic_t trapped;
#define NETPOLL_RX_ENABLED 1
@@ -56,49 +52,33 @@
static void zap_completion_queue(void);
static void arp_reply(struct sk_buff *skb);
-static void queue_process(void *p)
+static void netpoll_run(unsigned long arg)
{
- unsigned long flags;
+ struct net_device *dev = (struct net_device *) arg;
+ struct netpoll_info *npinfo = dev->npinfo;
struct sk_buff *skb;
- while (queue_head) {
- spin_lock_irqsave(&queue_lock, flags);
-
- skb = queue_head;
- queue_head = skb->next;
- if (skb == queue_tail)
- queue_head = NULL;
-
- queue_depth--;
-
- spin_unlock_irqrestore(&queue_lock, flags);
+ while ((skb = skb_dequeue(&npinfo->tx_q))) {
+ int rc;
- dev_queue_xmit(skb);
- }
-}
-
-static DECLARE_WORK(send_queue, queue_process, NULL);
+ if (!netif_running(dev) || !netif_device_present(dev)) {
+ __kfree_skb(skb);
+ continue;
+ }
-void netpoll_queue(struct sk_buff *skb)
-{
- unsigned long flags;
+ netif_tx_lock(dev);
+ if (netif_queue_stopped(dev))
+ rc = NETDEV_TX_BUSY;
+ else
+ rc = dev->hard_start_xmit(skb, dev);
+ netif_tx_unlock(dev);
- if (queue_depth == MAX_QUEUE_DEPTH) {
- __kfree_skb(skb);
- return;
+ if (rc != NETDEV_TX_OK) {
+ skb_queue_head(&npinfo->tx_q, skb);
+ tasklet_schedule(&npinfo->tx_task);
+ break;
+ }
}
- WARN_ON(skb->protocol == 0);
-
- spin_lock_irqsave(&queue_lock, flags);
- if (!queue_head)
- queue_head = skb;
- else
- queue_tail->next = skb;
- queue_tail = skb;
- queue_depth++;
- spin_unlock_irqrestore(&queue_lock, flags);
-
- schedule_work(&send_queue);
}
static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
@@ -232,6 +212,7 @@
static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve)
{
+ int once = 1, count = 0;
unsigned long flags;
struct sk_buff *skb = NULL;
@@ -254,6 +235,11 @@
}
if(!skb) {
+ count++;
+ if (once && (count == 1000000)) {
+ printk("out of netpoll skbs!\n");
+ once = 0;
+ }
netpoll_poll(np);
goto repeat;
}
@@ -265,51 +251,17 @@
static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
- int status;
- struct netpoll_info *npinfo;
+ struct net_device *dev = np->dev;
+ struct netpoll_info *npinfo = dev->npinfo;
- if (!np || !np->dev || !netif_device_present(np->dev) || !netif_running(np->dev)) {
- __kfree_skb(skb);
- return;
- }
-
- npinfo = np->dev->npinfo;
+ skb_queue_tail(&npinfo->tx_q, skb);
/* avoid recursion */
if (npinfo->poll_owner == smp_processor_id() ||
- np->dev->xmit_lock_owner == smp_processor_id()) {
- if (np->drop)
- np->drop(skb);
- else
- __kfree_skb(skb);
- return;
- }
-
- do {
- npinfo->tries--;
- netif_tx_lock(np->dev);
-
- /*
- * network drivers do not expect to be called if the queue is
- * stopped.
- */
- status = NETDEV_TX_BUSY;
- if (!netif_queue_stopped(np->dev))
- status = np->dev->hard_start_xmit(skb, np->dev);
-
- netif_tx_unlock(np->dev);
-
- /* success */
- if(!status) {
- npinfo->tries = MAX_RETRIES; /* reset */
- return;
- }
-
- /* transmit busy */
- netpoll_poll(np);
- udelay(50);
- } while (npinfo->tries > 0);
- __kfree_skb(skb);
+ dev->xmit_lock_owner == smp_processor_id())
+ tasklet_schedule(&npinfo->tx_task);
+ else
+ netpoll_run((unsigned long) dev);
}
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -662,9 +614,10 @@
npinfo->rx_np = NULL;
spin_lock_init(&npinfo->poll_lock);
npinfo->poll_owner = -1;
- npinfo->tries = MAX_RETRIES;
spin_lock_init(&npinfo->rx_lock);
skb_queue_head_init(&npinfo->arp_tx);
+ skb_queue_head_init(&npinfo->tx_q);
+ tasklet_init(&npinfo->tx_task, netpoll_run, (unsigned long) ndev);
} else
npinfo = ndev->npinfo;
@@ -772,6 +725,11 @@
npinfo->rx_np = NULL;
npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+
+ skb_queue_purge(&npinfo->arp_tx);
+ skb_queue_purge(&npinfo->tx_q);
+
+ tasklet_kill(&npinfo->tx_task);
}
dev_put(np->dev);
}
@@ -799,4 +757,3 @@
EXPORT_SYMBOL(netpoll_cleanup);
EXPORT_SYMBOL(netpoll_send_udp);
EXPORT_SYMBOL(netpoll_poll);
-EXPORT_SYMBOL(netpoll_queue);
--
From: Stephen Hemminger <[email protected]>
Date: Thu, 19 Oct 2006 10:15:43 -0700
> The original skb management for netpoll was a mess; it had two queue paths
> and a callback. This changes it to have a per-instance transmit queue
> and use a tasklet rather than a work queue for the congested case.
>
> Signed-off-by: Stephen Hemminger <[email protected]>
I think you mis-diffed this one:
- WARN_ON(skb->protocol == 0);
That line doesn't exist in my copy of net/core/netpoll.c
even with your first patch applied.
Also, you forgot to remove the ->drop callback pointer
from struct netpoll, which you should do if it really
isn't used any more.
I think you might run into problems there, as I believe the netdump
stuff does make non-trivial use of the ->drop callback. Indeed, it
uses the ->dump callback for invoking a special
netpoll_start_netdump() function. I'm pretty sure ->dump was created
specifically to accommodate netdump.
So this is something else which will need to be worked out before we
can apply this patch.
On Fri, 20 Oct 2006 00:15:30 -0700 (PDT)
David Miller <[email protected]> wrote:
> From: Stephen Hemminger <[email protected]>
> Date: Thu, 19 Oct 2006 10:15:43 -0700
>
> > The original skb management for netpoll was a mess; it had two queue paths
> > and a callback. This changes it to have a per-instance transmit queue
> > and use a tasklet rather than a work queue for the congested case.
> >
> > Signed-off-by: Stephen Hemminger <[email protected]>
>
> I think you mis-diffed this one:
>
> - WARN_ON(skb->protocol == 0);
>
> That line doesn't exist in my copy of net/core/netpoll.c
> even with your first patch applied.
>
> Also, you forgot to remove the ->drop callback pointer
> from struct netpoll, which you should do if it really
> isn't used any more.
>
> I think you might run into problems there, as I believe the netdump
> stuff does make non-trivial use of the ->drop callback. Indeed, it
> uses the ->dump callback for invoking a special
> netpoll_start_netdump() function. I'm pretty sure ->dump was created
> specifically to accommodate netdump.
>
Netdump is not in the tree, so I can't fix it. Also netdump is pretty
much superseded by kdump.
> So this is something else which will need to be worked out before we
> can apply this patch.
Change netpoll to use an skb queue for transmit. This solves the
problems caused by having two tx paths, which can reorder packets
and give different semantics depending on whether netconsole output
goes through the fast or the slow path.
The only user of the drop hook was netconsole, and I fixed that path.
This probably breaks netdump, but that is out of tree, so it needs
to fix itself.
Signed-off-by: Stephen Hemminger <[email protected]>
---
drivers/net/netconsole.c | 1
include/linux/netpoll.h | 3 +
net/core/netpoll.c | 109 ++++++++++++----------------------------------
3 files changed, 31 insertions(+), 82 deletions(-)
diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
index bf58db2..fad5bf2 100644
--- a/drivers/net/netconsole.c
+++ b/drivers/net/netconsole.c
@@ -60,7 +60,6 @@ static struct netpoll np = {
.local_port = 6665,
.remote_port = 6666,
.remote_mac = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
- .drop = netpoll_queue,
};
static int configured = 0;
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 1efe60c..835ca0d 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -27,11 +27,12 @@ struct netpoll {
struct netpoll_info {
spinlock_t poll_lock;
int poll_owner;
- int tries;
int rx_flags;
spinlock_t rx_lock;
struct netpoll *rx_np; /* netpoll that registered an rx_hook */
struct sk_buff_head arp_tx; /* list of arp requests to reply to */
+ struct sk_buff_head tx_q;
+ struct tasklet_struct tx_task;
};
void netpoll_poll(struct netpoll *np);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 9308af0..22332fd 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -40,10 +40,6 @@ static DEFINE_SPINLOCK(skb_list_lock);
static int nr_skbs;
static struct sk_buff *skbs;
-static DEFINE_SPINLOCK(queue_lock);
-static int queue_depth;
-static struct sk_buff *queue_head, *queue_tail;
-
static atomic_t trapped;
#define NETPOLL_RX_ENABLED 1
@@ -56,50 +52,31 @@ #define MAX_SKB_SIZE \
static void zap_completion_queue(void);
static void arp_reply(struct sk_buff *skb);
-static void queue_process(void *p)
+static void netpoll_run(unsigned long arg)
{
- unsigned long flags;
+ struct net_device *dev = (struct net_device *) arg;
+ struct netpoll_info *npinfo = dev->npinfo;
struct sk_buff *skb;
- while (queue_head) {
- spin_lock_irqsave(&queue_lock, flags);
-
- skb = queue_head;
- queue_head = skb->next;
- if (skb == queue_tail)
- queue_head = NULL;
-
- queue_depth--;
+ while ((skb = skb_dequeue(&npinfo->tx_q))) {
+ if (!netif_running(dev) || !netif_device_present(dev)) {
+ __kfree_skb(skb);
+ continue;
+ }
- spin_unlock_irqrestore(&queue_lock, flags);
+ netif_tx_lock(dev);
+ if (netif_queue_stopped(dev) ||
+ dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
+ skb_queue_head(&npinfo->tx_q, skb);
+ netif_tx_unlock(dev);
+ tasklet_schedule(&npinfo->tx_task);
+ return;
+ }
- dev_queue_xmit(skb);
+ netif_tx_unlock(dev);
}
}
-static DECLARE_WORK(send_queue, queue_process, NULL);
-
-void netpoll_queue(struct sk_buff *skb)
-{
- unsigned long flags;
-
- if (queue_depth == MAX_QUEUE_DEPTH) {
- __kfree_skb(skb);
- return;
- }
-
- spin_lock_irqsave(&queue_lock, flags);
- if (!queue_head)
- queue_head = skb;
- else
- queue_tail->next = skb;
- queue_tail = skb;
- queue_depth++;
- spin_unlock_irqrestore(&queue_lock, flags);
-
- schedule_work(&send_queue);
-}
-
static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
unsigned short ulen, u32 saddr, u32 daddr)
{
@@ -270,50 +247,17 @@ repeat:
static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
- int status;
- struct netpoll_info *npinfo;
-
- if (!np || !np->dev || !netif_running(np->dev)) {
- __kfree_skb(skb);
- return;
- }
+ struct net_device *dev = np->dev;
+ struct netpoll_info *npinfo = dev->npinfo;
- npinfo = np->dev->npinfo;
+ skb_queue_tail(&npinfo->tx_q, skb);
/* avoid recursion */
if (npinfo->poll_owner == smp_processor_id() ||
- np->dev->xmit_lock_owner == smp_processor_id()) {
- if (np->drop)
- np->drop(skb);
- else
- __kfree_skb(skb);
+ dev->xmit_lock_owner == smp_processor_id())
return;
- }
- do {
- npinfo->tries--;
- netif_tx_lock(np->dev);
-
- /*
- * network drivers do not expect to be called if the queue is
- * stopped.
- */
- status = NETDEV_TX_BUSY;
- if (!netif_queue_stopped(np->dev))
- status = np->dev->hard_start_xmit(skb, np->dev);
-
- netif_tx_unlock(np->dev);
-
- /* success */
- if(!status) {
- npinfo->tries = MAX_RETRIES; /* reset */
- return;
- }
-
- /* transmit busy */
- netpoll_poll(np);
- udelay(50);
- } while (npinfo->tries > 0);
+ netpoll_run((unsigned long) dev);
}
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -666,9 +610,10 @@ int netpoll_setup(struct netpoll *np)
npinfo->rx_np = NULL;
spin_lock_init(&npinfo->poll_lock);
npinfo->poll_owner = -1;
- npinfo->tries = MAX_RETRIES;
spin_lock_init(&npinfo->rx_lock);
skb_queue_head_init(&npinfo->arp_tx);
+ skb_queue_head_init(&npinfo->tx_q);
+ tasklet_init(&npinfo->tx_task, netpoll_run, (unsigned long) ndev);
} else
npinfo = ndev->npinfo;
@@ -776,6 +721,11 @@ void netpoll_cleanup(struct netpoll *np)
npinfo->rx_np = NULL;
npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+
+ skb_queue_purge(&npinfo->arp_tx);
+ skb_queue_purge(&npinfo->tx_q);
+
+ tasklet_kill(&npinfo->tx_task);
}
dev_put(np->dev);
}
@@ -803,4 +753,3 @@ EXPORT_SYMBOL(netpoll_setup);
EXPORT_SYMBOL(netpoll_cleanup);
EXPORT_SYMBOL(netpoll_send_udp);
EXPORT_SYMBOL(netpoll_poll);
-EXPORT_SYMBOL(netpoll_queue);
--
1.4.2.4
From: Stephen Hemminger <[email protected]>
Date: Fri, 20 Oct 2006 08:18:57 -0700
> Netdump is not in the tree, so I can't fix it. Also netdump is pretty
> much superseded by kdump.
Unless kdump is 100% ready you can be sure vendors will ship netdump
for a little while longer. I think gratuitously breaking netdump is
not the best idea.
It's not like netdump is some binary blob you can't get the source
to easily. :-)
On Fri, 20 Oct 2006 12:24:27 -0700 (PDT)
David Miller <[email protected]> wrote:
> From: Stephen Hemminger <[email protected]>
> Date: Fri, 20 Oct 2006 08:18:57 -0700
>
> > Netdump is not in the tree, so I can't fix it. Also netdump is pretty
> > much superseded by kdump.
>
> Unless kdump is 100% ready you can be sure vendors will ship netdump
> for a little while longer. I think gratuitously breaking netdump is
> not the best idea.
>
> It's not like netdump is some binary blob you can't get the source
> to easily. :-)
>
Sorry, but why should we treat out-of-tree vendor code any differently
than out-of-tree other code.
From: Stephen Hemminger <[email protected]>
Date: Fri, 20 Oct 2006 08:40:15 -0700
> The only user of the drop hook was netconsole, and I fixed that path.
> This probably breaks netdump, but that is out of tree, so it needs
> to fix itself.
I believe that netdump needs to requeue things because dropping the
packet is simply not allowed, and the ->drop callback gives the
netdump code a way to handle things without actually dropping the
packet. If that's true, you can't just free the SKB on it.
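In other words, a ->drop hook in that spirit would requeue rather than
free. A rough sketch (the queue name here is hypothetical; the real
netdump code is out of tree and may differ):

static struct sk_buff_head netdump_txq;	/* hypothetical retransmit queue,
					 * skb_queue_head_init() at setup */

/* Installed as np->drop: called by netpoll_send_skb() instead of
 * freeing the skb when it cannot be transmitted right now. */
static void netdump_drop(struct sk_buff *skb)
{
	skb_queue_tail(&netdump_txq, skb);	/* keep it for a later retry */
}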
Are you sure your new TX strategy can avoid such drops properly?
Please take a quick peek at the netdump code, it's available, and make
some reasonable effort to determine whether it can still work with
your new code.
On Fri, 20 Oct 2006 12:27:53 -0700 (PDT)
David Miller <[email protected]> wrote:
> From: Stephen Hemminger <[email protected]>
> Date: Fri, 20 Oct 2006 08:40:15 -0700
>
> > The only user of the drop hook was netconsole, and I fixed that path.
> > This probably breaks netdump, but that is out of tree, so it needs
> > to fix itself.
>
> I believe that netdump needs to requeue things because dropping the
> packet is simply not allowed, and the ->drop callback gives the
> netdump code a way to handle things without actually dropping the
> packet. If that's true, you can't just free the SKB on it.
>
> Are you sure your new TX strategy can avoid such drops properly?
Yes, it has a queue. If it can't send, it waits and retries.
>
> Please take a quick peek at the netdump code, it's available, and make
> some reasonable effort to determine whether it can still work with
> your new code.
Where? I'm not digging inside some RHEL rpm patch pile to find it.
--
Stephen Hemminger <[email protected]>
From: Stephen Hemminger <[email protected]>
Date: Fri, 20 Oct 2006 12:25:27 -0700
> Sorry, but why should we treat out-of-tree vendor code any
> differently than out-of-tree other code.
I think what netdump was trying to do, provide a way to
requeue instead of fully drop the SKB, is quite reasonable.
Don't you think?
On Fri, 20 Oct 2006 12:52:26 -0700 (PDT)
David Miller <[email protected]> wrote:
> From: Stephen Hemminger <[email protected]>
> Date: Fri, 20 Oct 2006 12:25:27 -0700
>
> > Sorry, but why should we treat out-of-tree vendor code any
> > differently than out-of-tree other code.
>
> I think what netdump was trying to do, provide a way to
> requeue instead of fully drop the SKB, is quite reasonable.
> Don't you think?
Yes, but the queued vs. non-queued messages showed up out of order.
The queued messages go through the wrong Tx path, i.e. they end up
going into NIT etc. Since the deferred send uses a work queue,
it wouldn't work for last-gasp messages or netdump: getting
a work queue to run requires going back to the scheduler with
processes running. And it should use sk_buff_head instead
of roll-your-own queueing.
The other alternative would be to make the send logic non-blocking
and fully push retry to the caller.
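Concretely, with an sk_buff_head the deferred path collapses to
something like this (a sketch along the lines of the patches later in
this thread; the choice of Tx path itself is a separate question):

#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/netdevice.h>

/* replaces queue_head/queue_tail/queue_depth and queue_lock;
 * initialized with skb_queue_head_init(&netpoll_txq) at init time */
static struct sk_buff_head netpoll_txq;

static void queue_process(void *unused)
{
	struct sk_buff *skb;

	/* sk_buff_head does its own locking */
	while ((skb = skb_dequeue(&netpoll_txq)))
		dev_queue_xmit(skb);
}

static DECLARE_WORK(send_queue, queue_process, NULL);

void netpoll_queue(struct sk_buff *skb)
{
	skb_queue_tail(&netpoll_txq, skb);
	schedule_work(&send_queue);
}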
I'll make a fix to netdump, if I can find it.
--
Stephen Hemminger <[email protected]>
On Fri, 20 Oct 2006 12:52:26 -0700 (PDT)
David Miller <[email protected]> wrote:
> From: Stephen Hemminger <[email protected]>
> Date: Fri, 20 Oct 2006 12:25:27 -0700
>
> > Sorry, but why should we treat out-of-tree vendor code any
> > differently than out-of-tree other code.
>
> I think what netdump was trying to do, provide a way to
> requeue instead of fully drop the SKB, is quite reasonable.
> Don't you think?
Netdump doesn't even exist in the current Fedora source rpm.
I think Dave dropped it.
From: Stephen Hemminger <[email protected]>
Date: Fri, 20 Oct 2006 08:40:15 -0700
> -static void queue_process(void *p)
> +static void netpoll_run(unsigned long arg)
> {
...
> - spin_unlock_irqrestore(&queue_lock, flags);
> + netif_tx_lock(dev);
> + if (netif_queue_stopped(dev) ||
> + dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
> + skb_queue_head(&npinfo->tx_q, skb);
> + netif_tx_unlock(dev);
> + tasklet_schedule(&npinfo->tx_task);
> + return;
> + }
We really can't handle TX stopped this way from the netpoll_send_skb()
path. All that old retry logic in netpoll_send_skb() is really
necessary.
If we are in deep IRQ context, took an OOPS, and are trying to get a
netpoll packet out for the kernel log message, we have to try as hard
as possible to get the packet out then and there, even if that means
waiting some amount of time for netif_queue_stopped() to become false.
That is what the existing code is trying to do.
If you defer to a tasklet, the kernel state from the OOPS can be so
corrupted that the tasklet will never run and we'll never get the
netconsole message needed to debug the problem.
Also, if we tasklet schedule from the tasklet, we'll just keep looping
in the tasklet and never leave softirq context, which is also bad
behavior. Even in the tasklet, we should spin and poll when possible
like the current netpoll_send_skb() code does.
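For reference, the retry logic being defended is essentially what the
patch deletes from netpoll_send_skb(); restated as plain C inside
net/core/netpoll.c (hypothetical helper name), the pattern is:

/* Spin on TX-full/link-down for a bounded number of attempts,
 * polling the driver in between, instead of deferring to a tasklet. */
static void netpoll_xmit_spin(struct netpoll *np, struct sk_buff *skb)
{
	int status, tries;

	for (tries = MAX_RETRIES; tries > 0; --tries) {
		netif_tx_lock(np->dev);

		/* drivers do not expect to be called while stopped */
		status = NETDEV_TX_BUSY;
		if (!netif_queue_stopped(np->dev))
			status = np->dev->hard_start_xmit(skb, np->dev);

		netif_tx_unlock(np->dev);

		if (status == NETDEV_TX_OK)
			return;

		/* busy: let the driver reap completed TX descriptors */
		netpoll_poll(np);
		udelay(50);
	}
	__kfree_skb(skb);	/* give up only after MAX_RETRIES attempts */
}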
So we really can't apply this patch.
On Fri, 20 Oct 2006 13:42:09 -0700 (PDT)
David Miller <[email protected]> wrote:
> From: Stephen Hemminger <[email protected]>
> Date: Fri, 20 Oct 2006 08:40:15 -0700
>
> > -static void queue_process(void *p)
> > +static void netpoll_run(unsigned long arg)
> > {
> ...
> > - spin_unlock_irqrestore(&queue_lock, flags);
> > + netif_tx_lock(dev);
> > + if (netif_queue_stopped(dev) ||
> > + dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
> > + skb_queue_head(&npinfo->tx_q, skb);
> > + netif_tx_unlock(dev);
> > + tasklet_schedule(&npinfo->tx_task);
> > + return;
> > + }
>
> We really can't handle TX stopped this way from the netpoll_send_skb()
> path. All that old retry logic in netpoll_send_skb() is really
> necessary.
>
> If we are in deep IRQ context, took an OOPS, and are trying to get a
> netpoll packet out for the kernel log message, we have to try as hard
> as possible to get the packet out then and there, even if that means
> waiting some amount of time for netif_queue_stopped() to become false.
>
But, it also violates the assumptions of the network devices.
It calls NAPI poll back with IRQ's disabled and potentially doesn't
obey the semantics about only running on the same CPU as the
received packet.
> That is what the existing code is trying to do.
>
> If you defer to a tasklet, the kernel state from the OOPS can be so
> corrupted that the tasklet will never run and we'll never get the
> netconsole message needed to debug the problem.
So we can try once, and if that fails we have to defer to the tasklet.
We can't call NAPI poll, and we can't try to clean up; the device will
need an IRQ to get unblocked.
Or we could add another device callback just to handle that case.
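Roughly, that "try once, otherwise defer" idea would look like this
(hypothetical helper; tx_q and tx_task as in the patch above):

static void netpoll_xmit_once_or_defer(struct net_device *dev,
				       struct netpoll_info *npinfo,
				       struct sk_buff *skb)
{
	int status = NETDEV_TX_BUSY;

	netif_tx_lock(dev);
	if (!netif_queue_stopped(dev))
		status = dev->hard_start_xmit(skb, dev);
	netif_tx_unlock(dev);

	if (status != NETDEV_TX_OK) {
		/* single attempt failed: requeue and let the tasklet
		 * retry after the device has had a chance to unblock */
		skb_queue_head(&npinfo->tx_q, skb);
		tasklet_schedule(&npinfo->tx_task);
	}
}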
> Also, if we tasklet schedule from the tasklet, we'll just keep looping
> in the tasklet and never leave softirq context, which is also bad
> behavior. Even in the tasklet, we should spin and poll when possible
> like the current netpoll_send_skb() code does.
>
> So we really can't apply this patch.
--
Stephen Hemminger <[email protected]>
From: Stephen Hemminger <[email protected]>
Date: Fri, 20 Oct 2006 13:48:26 -0700
> On Fri, 20 Oct 2006 13:42:09 -0700 (PDT)
> David Miller <[email protected]> wrote:
>
> > We really can't handle TX stopped this way from the netpoll_send_skb()
> > path. All that old retry logic in netpoll_send_skb() is really
> > necessary.
> >
> > If we are in deep IRQ context, took an OOPS, and are trying to get a
> > netpoll packet out for the kernel log message, we have to try as hard
> > as possible to get the packet out then and there, even if that means
> > waiting some amount of time for netif_queue_stopped() to become false.
>
> But, it also violates the assumptions of the network devices.
> It calls NAPI poll back with IRQ's disabled and potentially doesn't
> obey the semantics about only running on the same CPU as the
> received packet.
Actually, all the locking here is fine; that's why it checks
poll_owner against the current smp_processor_id().
Otherwise it is safe to sit and spin waiting for link up/down or
TX full conditions to pass.
> So we can try once, and if that fails we have to defer to the tasklet.
Not true; we should spin and retry for some reasonable amount of time.
That "reasonable amount of time" should be the maximum of the time
it takes to free up space from a TX full condition and the time it
takes to bring the link up.
That is what the current code is trying to do, and you've erroneously
deleted all of that logic.
> But, it also violates the assumptions of the network devices.
> It calls NAPI poll back with IRQ's disabled and potentially doesn't
> obey the semantics about only running on the same CPU as the
> received packet.
netpoll always played a little fast'n'loose with various locking rules.
Also, the drivers themselves are often a little sloppy in the poll path.
That's fine for getting an oops out, but risky for normal kernel
messages, when the driver must still be working afterwards.
The standard console code makes this conditional on oops_in_progress -
it breaks locks when true and otherwise it follows the safe
approach.
Perhaps it would be better to use different paths in netconsole too
depending on whether oops_in_progress is true or not.
e.g. if !oops_in_progress (use the standard output path)
else use current path.
That would avoid breaking the driver during normal operation
and also have the advantage that the normal netconsole messages
would go through the queueing disciplines etc.
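A rough sketch of that split, as a hypothetical wrapper living next to
netpoll_send_skb() in net/core/netpoll.c (oops_in_progress is the flag
the console code already uses):

static void netconsole_xmit(struct netpoll *np, struct sk_buff *skb)
{
	if (!oops_in_progress) {
		/* normal operation: go through the regular stack,
		 * queueing disciplines and all */
		skb->dev = np->dev;
		dev_queue_xmit(skb);
	} else {
		/* oops in progress: bang on the device directly,
		 * bending the locking rules as netpoll does today */
		netpoll_send_skb(np, skb);
	}
}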
-Andi
From: Andi Kleen <[email protected]>
Date: Fri, 20 Oct 2006 23:01:29 +0200
> netpoll always played a little fast'n'loose with various locking rules.
The current code is fine, it never reenters ->poll, because it
maintains a "->poll_owner" which it checks in netpoll_send_skb()
before trying to call back into ->poll.
Every call to ->poll first sets ->poll_owner to the current cpu id.
netpoll_send_skb() aborts and does a drop if ->poll_owner is set to
the current smp_processor_id().
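In code, the guard being described amounts to roughly this (condensed
sketch; the poll_lock handling is omitted):

static void netpoll_do_poll(struct netpoll *np)
{
	struct netpoll_info *npinfo = np->dev->npinfo;
	int budget = 16;

	npinfo->poll_owner = smp_processor_id();	/* record the polling CPU */
	np->dev->poll(np->dev, &budget);
	npinfo->poll_owner = -1;
}

static int netpoll_tx_would_recurse(struct netpoll_info *npinfo)
{
	/* if this CPU is already inside ->poll, transmitting now could
	 * re-enter the driver, so netpoll_send_skb() drops instead */
	return npinfo->poll_owner == smp_processor_id();
}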
I sometimes feel like I'm the only person actually reading the sources
and past threads on this topic before replying.
On Friday 20 October 2006 23:08, David Miller wrote:
> From: Andi Kleen <[email protected]>
> Date: Fri, 20 Oct 2006 23:01:29 +0200
>
> > netpoll always played a little fast'n'loose with various locking rules.
>
> The current code is fine, it never reenters ->poll, because it
> maintains a "->poll_owner" which it checks in netpoll_send_skb()
> before trying to call back into ->poll.
I was thinking more of reentry of the interrupt handler in
the driver, etc. A lot of them also do printk, but that is fortunately
caught at a higher level.
-Andi
On Fri, 20 Oct 2006 23:16:03 +0200
Andi Kleen <[email protected]> wrote:
> On Friday 20 October 2006 23:08, David Miller wrote:
> > From: Andi Kleen <[email protected]>
> > Date: Fri, 20 Oct 2006 23:01:29 +0200
> >
> > > netpoll always played a little fast'n'loose with various locking rules.
> >
> > The current code is fine, it never reenters ->poll, because it
> > maintains a "->poll_owner" which it checks in netpoll_send_skb()
> > before trying to call back into ->poll.
>
> I was more thinking of reentry of the interrupt handler in
> the driver etc. A lot of them also do printk, but that is fortunately
> caught higher level.
>
> -Andi
One problem is that with netconsole the NAPI poll routine can be
called with IRQ's disabled. This means that calls to dev_kfree_skb()
are incorrect in this context. The driver would have to use dev_kfree_skb_any()
instead.
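For example, in a driver TX-completion routine that netpoll may end up
invoking with IRQs disabled (hypothetical driver function):

static void example_tx_complete(struct net_device *dev, struct sk_buff *skb)
{
	/* dev_kfree_skb() must not be used here: netpoll can call the
	 * driver's ->poll with IRQs disabled.  dev_kfree_skb_any()
	 * checks the context and defers the free when necessary. */
	dev_kfree_skb_any(skb);
}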
For most normal sends, the netpoll code was calling dev->hard_start_xmit()
directly. If the device was busy, it would defer the skb and process it
later, but that deferred path uses dev_queue_xmit(), and that code sends
the skb through NIT and other stuff that netpoll doesn't want.
Signed-off-by: Stephen Hemminger <[email protected]>
--- netpoll.orig/net/core/netpoll.c
+++ netpoll/net/core/netpoll.c
@@ -51,17 +51,28 @@ static atomic_t trapped;
static void zap_completion_queue(void);
static void arp_reply(struct sk_buff *skb);
+static void queue_process(void *);
+static DECLARE_WORK(send_queue, queue_process, NULL);
+
static void queue_process(void *p)
{
struct sk_buff *skb;
- while ((skb = skb_dequeue(&netpoll_txq)))
- dev_queue_xmit(skb);
+ while ((skb = skb_dequeue(&netpoll_txq))) {
+ struct net_device *dev = skb->dev;
+ netif_tx_lock_bh(dev);
+ if (netif_queue_stopped(dev) ||
+ dev->hard_start_xmit(skb, dev) != NETDEV_TX_OK) {
+ skb_queue_head(&netpoll_txq, skb);
+ netif_tx_unlock_bh(dev);
+ schedule_delayed_work(&send_queue, 1);
+ break;
+ }
+ netif_tx_unlock_bh(dev);
+ }
}
-static DECLARE_WORK(send_queue, queue_process, NULL);
-
void netpoll_queue(struct sk_buff *skb)
{
skb_queue_tail(&netpoll_txq, skb);
This is similar in spirit to the earlier patches, with fewer changes.
Get rid of the DIY queue for skbs used in netpoll. Add missing code to
clean up the queue on shutdown.
Signed-off-by: Stephen Hemminger <[email protected]>
--- netpoll.orig/net/core/netpoll.c
+++ netpoll/net/core/netpoll.c
@@ -37,10 +37,7 @@
#define MAX_RETRIES 20000
static struct sk_buff_head netpoll_pool;
-
-static DEFINE_SPINLOCK(queue_lock);
-static int queue_depth;
-static struct sk_buff *queue_head, *queue_tail;
+static struct sk_buff_head netpoll_txq;
static atomic_t trapped;
@@ -56,44 +53,18 @@ static void arp_reply(struct sk_buff *sk
static void queue_process(void *p)
{
- unsigned long flags;
struct sk_buff *skb;
- while (queue_head) {
- spin_lock_irqsave(&queue_lock, flags);
-
- skb = queue_head;
- queue_head = skb->next;
- if (skb == queue_tail)
- queue_head = NULL;
-
- queue_depth--;
-
- spin_unlock_irqrestore(&queue_lock, flags);
-
+ while ((skb = skb_dequeue(&netpoll_txq)))
dev_queue_xmit(skb);
- }
+
}
static DECLARE_WORK(send_queue, queue_process, NULL);
void netpoll_queue(struct sk_buff *skb)
{
- unsigned long flags;
-
- if (queue_depth == MAX_QUEUE_DEPTH) {
- __kfree_skb(skb);
- return;
- }
-
- spin_lock_irqsave(&queue_lock, flags);
- if (!queue_head)
- queue_head = skb;
- else
- queue_tail->next = skb;
- queue_tail = skb;
- queue_depth++;
- spin_unlock_irqrestore(&queue_lock, flags);
+ skb_queue_tail(&netpoll_txq, skb);
schedule_work(&send_queue);
}
@@ -745,6 +716,7 @@ int netpoll_setup(struct netpoll *np)
}
static int __init netpoll_init(void) {
+ skb_queue_head_init(&netpoll_txq);
skb_queue_head_init(&netpoll_pool);
return 0;
}
@@ -752,20 +724,30 @@ core_initcall(netpoll_init);
void netpoll_cleanup(struct netpoll *np)
{
- struct netpoll_info *npinfo;
+ struct net_device *dev = np->dev;
+ struct sk_buff *skb, *next;
unsigned long flags;
- if (np->dev) {
- npinfo = np->dev->npinfo;
+ if (dev) {
+ struct netpoll_info *npinfo = dev->npinfo;
+
if (npinfo && npinfo->rx_np == np) {
spin_lock_irqsave(&npinfo->rx_lock, flags);
npinfo->rx_np = NULL;
npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
}
- dev_put(np->dev);
+ dev_put(dev);
+
+ spin_lock_irqsave(&netpoll_txq.lock, flags);
+ for (skb = (struct sk_buff *)netpoll_txq.next;
+ skb != (struct sk_buff *)&netpoll_txq; skb = next) {
+ next = skb->next;
+ if (skb->dev == dev)
+ skb_unlink(skb, &netpoll_txq);
+ }
+ spin_unlock_irqrestore(&netpoll_txq.lock, flags);
}
-
np->dev = NULL;
}
Change the retry logic of netpoll. Always requeue if unable to send,
instead of dropping. Make the retry counter per-send rather than causing
mass migration when tries goes below zero. Since the callers are either
netpoll_send_arp or netpoll_send_udp, we know that np and np->dev
can't be null.
Signed-off-by: Stephen Hemminger <[email protected]>
--- netpoll.orig/include/linux/netpoll.h
+++ netpoll/include/linux/netpoll.h
@@ -27,7 +27,6 @@ struct netpoll {
struct netpoll_info {
spinlock_t poll_lock;
int poll_owner;
- int tries;
int rx_flags;
spinlock_t rx_lock;
struct netpoll *rx_np; /* netpoll that registered an rx_hook */
--- netpoll.orig/net/core/netpoll.c
+++ netpoll/net/core/netpoll.c
@@ -232,50 +232,43 @@ static struct sk_buff * find_skb(struct
static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
- int status;
- struct netpoll_info *npinfo;
-
- if (!np || !np->dev || !netif_running(np->dev))
- goto free_skb;
+ struct net_device *dev = np->dev;
+ struct netpoll_info *npinfo = dev->npinfo;
+ int status, tries;
- npinfo = np->dev->npinfo;
+ if (!netif_running(dev) || !netif_device_present(dev))
+ __kfree_skb(skb);
/* avoid recursion */
if (npinfo->poll_owner == smp_processor_id() ||
- np->dev->xmit_lock_owner == smp_processor_id()) {
- if (np->drop)
- np->drop(skb);
- else
- __kfree_skb(skb);
- return;
- }
-
- do {
- npinfo->tries--;
- netif_tx_lock(np->dev);
+ dev->xmit_lock_owner == smp_processor_id())
+ goto busy;
+ for (tries = MAX_RETRIES; tries; --tries) {
/*
* network drivers do not expect to be called if the queue is
* stopped.
*/
- status = NETDEV_TX_BUSY;
- if (!netif_queue_stopped(np->dev))
- status = np->dev->hard_start_xmit(skb, np->dev);
-
- netif_tx_unlock(np->dev);
+ if (netif_queue_stopped(dev))
+ status = NETDEV_TX_BUSY;
+ else
+ status = dev->hard_start_xmit(skb, dev);
+ netif_tx_unlock(dev);
/* success */
- if(!status) {
- npinfo->tries = MAX_RETRIES; /* reset */
+ if(status == NETDEV_TX_OK)
return;
- }
/* transmit busy */
netpoll_poll(np);
udelay(50);
- } while (npinfo->tries > 0);
-free_skb:
- __kfree_skb(skb);
+ }
+
+busy:
+ if (np->drop)
+ np->drop(skb);
+ else
+ __kfree_skb(skb);
}
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -628,7 +621,7 @@ int netpoll_setup(struct netpoll *np)
npinfo->rx_np = NULL;
spin_lock_init(&npinfo->poll_lock);
npinfo->poll_owner = -1;
- npinfo->tries = MAX_RETRIES;
+
spin_lock_init(&npinfo->rx_lock);
skb_queue_head_init(&npinfo->arp_tx);
} else
On Fri, Oct 20, 2006 at 01:25:32PM -0700, Stephen Hemminger wrote:
> On Fri, 20 Oct 2006 12:52:26 -0700 (PDT)
> David Miller <[email protected]> wrote:
>
> > From: Stephen Hemminger <[email protected]>
> > Date: Fri, 20 Oct 2006 12:25:27 -0700
> >
> > > Sorry, but why should we treat out-of-tree vendor code any
> > > differently than out-of-tree other code.
> >
> > I think what netdump was trying to do, provide a way to
> > requeue instead of fully drop the SKB, is quite reasonable.
> > Don't you think?
>
>
> Netdump doesn't even exist in the current Fedora source rpm.
> I think Dave dropped it.
Indeed. Practically no-one cared about it, so it bit-rotted
really fast after we shipped RHEL4. That, along with the focus
shifting to making kdump work, seemed to kill it off over the last
12 months.
Dave
--
http://www.codemonkey.org.uk
From: Dave Jones <[email protected]>
Date: Sat, 21 Oct 2006 01:00:16 -0400
> Practically no-one cared about it, so it bit-rotted really fast
> after we shipped RHEL4. That, along with the focus shifting to
> making kdump work, seemed to kill it off over the last 12 months.
Then we can truly kill off the ->drop() callback as part
of Stephen's patches.
Stephen, I'll review your new set over the weekend.
From: Stephen Hemminger <[email protected]>
Date: Fri, 20 Oct 2006 15:30:27 -0700
> + spin_lock_irqsave(&netpoll_txq.lock, flags);
> + for (skb = (struct sk_buff *)netpoll_txq.next;
> + skb != (struct sk_buff *)&netpoll_txq; skb = next) {
> + next = skb->next;
> + if (skb->dev == dev)
> + skb_unlink(skb, &netpoll_txq);
> + }
> + spin_unlock_irqrestore(&netpoll_txq.lock, flags);
> }
All SKBs removed here will be leaked; nothing frees them up.
Since #2 and #3 depend upon this patch, I'll hold off on those
until you fix this bug.
Thanks.
Trivial cleanup of the netpoll interface. Use constants
for sizes, to make usage clear.
Signed-off-by: Stephen Hemminger <[email protected]>
--- netpoll.orig/include/linux/netpoll.h
+++ netpoll/include/linux/netpoll.h
@@ -12,15 +12,14 @@
#include <linux/rcupdate.h>
#include <linux/list.h>
-struct netpoll;
-
struct netpoll {
struct net_device *dev;
- char dev_name[16], *name;
+ char dev_name[IFNAMSIZ];
+ const char *name;
void (*rx_hook)(struct netpoll *, int, char *, int);
u32 local_ip, remote_ip;
u16 local_port, remote_port;
- unsigned char local_mac[6], remote_mac[6];
+ u8 local_mac[ETH_ALEN], remote_mac[ETH_ALEN];
};
struct netpoll_info {
This is the 3rd version of the netpoll patches. The first patch
switches from an open-coded skb list to using an sk_buff_head.
It also flushes packets from the queue when a device is removed from
netpoll.
Signed-off-by: Stephen Hemminger <[email protected]>
--- netpoll.orig/net/core/netpoll.c
+++ netpoll/net/core/netpoll.c
@@ -37,10 +37,7 @@
#define MAX_RETRIES 20000
static struct sk_buff_head netpoll_pool;
-
-static DEFINE_SPINLOCK(queue_lock);
-static int queue_depth;
-static struct sk_buff *queue_head, *queue_tail;
+static struct sk_buff_head netpoll_txq;
static atomic_t trapped;
@@ -56,44 +53,35 @@ static void arp_reply(struct sk_buff *sk
static void queue_process(void *p)
{
- unsigned long flags;
struct sk_buff *skb;
- while (queue_head) {
- spin_lock_irqsave(&queue_lock, flags);
-
- skb = queue_head;
- queue_head = skb->next;
- if (skb == queue_tail)
- queue_head = NULL;
+ while ((skb = skb_dequeue(&netpoll_txq)))
+ dev_queue_xmit(skb);
- queue_depth--;
+}
- spin_unlock_irqrestore(&queue_lock, flags);
+static void queue_purge(struct net_device *dev)
+{
+ struct sk_buff *skb, *next;
+ unsigned long flags;
- dev_queue_xmit(skb);
+ spin_lock_irqsave(&netpoll_txq.lock, flags);
+ for (skb = (struct sk_buff *)netpoll_txq.next;
+ skb != (struct sk_buff *)&netpoll_txq; skb = next) {
+ next = skb->next;
+ if (skb->dev == dev) {
+ skb_unlink(skb, &netpoll_txq);
+ kfree_skb(skb);
+ }
}
+ spin_unlock_irqrestore(&netpoll_txq.lock, flags);
}
static DECLARE_WORK(send_queue, queue_process, NULL);
void netpoll_queue(struct sk_buff *skb)
{
- unsigned long flags;
-
- if (queue_depth == MAX_QUEUE_DEPTH) {
- __kfree_skb(skb);
- return;
- }
-
- spin_lock_irqsave(&queue_lock, flags);
- if (!queue_head)
- queue_head = skb;
- else
- queue_tail->next = skb;
- queue_tail = skb;
- queue_depth++;
- spin_unlock_irqrestore(&queue_lock, flags);
+ skb_queue_tail(&netpoll_txq, skb);
schedule_work(&send_queue);
}
@@ -745,6 +733,7 @@ int netpoll_setup(struct netpoll *np)
}
static int __init netpoll_init(void) {
+ skb_queue_head_init(&netpoll_txq);
skb_queue_head_init(&netpoll_pool);
return 0;
}
@@ -752,20 +741,22 @@ core_initcall(netpoll_init);
void netpoll_cleanup(struct netpoll *np)
{
- struct netpoll_info *npinfo;
+ struct net_device *dev = np->dev;
unsigned long flags;
- if (np->dev) {
- npinfo = np->dev->npinfo;
+ if (dev) {
+ struct netpoll_info *npinfo = dev->npinfo;
+
if (npinfo && npinfo->rx_np == np) {
spin_lock_irqsave(&npinfo->rx_lock, flags);
npinfo->rx_np = NULL;
npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
}
- dev_put(np->dev);
- }
+ dev_put(dev);
+ queue_purge(dev);
+ }
np->dev = NULL;
}
From: Stephen Hemminger <[email protected]>
Date: Mon, 23 Oct 2006 12:02:53 -0700
> + spin_lock_irqsave(&netpoll_txq.lock, flags);
> + for (skb = (struct sk_buff *)netpoll_txq.next;
> + skb != (struct sk_buff *)&netpoll_txq; skb = next) {
> + next = skb->next;
> + if (skb->dev == dev) {
> + skb_unlink(skb, &netpoll_txq);
> + kfree_skb(skb);
> + }
> }
> + spin_unlock_irqrestore(&netpoll_txq.lock, flags);
IRQ's are disabled, I think we can't call kfree_skb() in such a
context.
That's why zap_completion_queue() has all of these funny
skb->destructor checks and such, all of this stuff potentially runs in
IRQ context.
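The pattern referred to is roughly: with IRQs off, only skbs without a
destructor may be freed directly; everything else has to be deferred.
As a sketch (hypothetical helper):

static void netpoll_free_skb_irqsafe(struct sk_buff *skb)
{
	if (skb->destructor)
		dev_kfree_skb_any(skb);	/* destructor may not be IRQ-safe: defer */
	else
		__kfree_skb(skb);	/* netpoll's own destructor-less skbs */
}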
On Mon, 23 Oct 2006 23:03:50 -0700 (PDT)
David Miller <[email protected]> wrote:
> From: Stephen Hemminger <[email protected]>
> Date: Mon, 23 Oct 2006 12:02:53 -0700
>
> > + spin_lock_irqsave(&netpoll_txq.lock, flags);
> > + for (skb = (struct sk_buff *)netpoll_txq.next;
> > + skb != (struct sk_buff *)&netpoll_txq; skb = next) {
> > + next = skb->next;
> > + if (skb->dev == dev) {
> > + skb_unlink(skb, &netpoll_txq);
> > + kfree_skb(skb);
> > + }
> > }
> > + spin_unlock_irqrestore(&netpoll_txq.lock, flags);
>
> IRQ's are disabled, I think we can't call kfree_skb() in such a
> context.
It is safe since the skbs only come from this code (no destructors).
>
> That's why zap_completion_queue() has all of these funny
> skb->destructor checks and such, all of this stuff potentially runs in
> IRQ context.
It should use __kfree_skb in the purge routine (like other places).
On Tue, 24 Oct 2006 07:51:30 -0700
Stephen Hemminger <[email protected]> wrote:
> On Mon, 23 Oct 2006 23:03:50 -0700 (PDT)
> David Miller <[email protected]> wrote:
>
> > From: Stephen Hemminger <[email protected]>
> > Date: Mon, 23 Oct 2006 12:02:53 -0700
> >
> > > + spin_lock_irqsave(&netpoll_txq.lock, flags);
> > > + for (skb = (struct sk_buff *)netpoll_txq.next;
> > > + skb != (struct sk_buff *)&netpoll_txq; skb = next) {
> > > + next = skb->next;
> > > + if (skb->dev == dev) {
> > > + skb_unlink(skb, &netpoll_txq);
> > > + kfree_skb(skb);
> > > + }
> > > }
> > > + spin_unlock_irqrestore(&netpoll_txq.lock, flags);
> >
> > IRQ's are disabled, I think we can't call kfree_skb() in such a
> > context.
>
> It is safe since the skbs only come from this code (no destructors).
>
Actually it does use destructors... I am doing a better version (per-device).