2009-06-16 19:44:20

by Paul Fulghum

[permalink] [raw]
Subject: [PATCH] synclink_gt fix transmit race and timeout

Fix race condition when adding transmit data to active DMA buffer ring
that can cause transmit stall.
Update transmit timeout when adding data to active DMA buffer ring.
Base transmit timeout on amount of buffered data instead of
using fixed value.

Signed-off-by: Paul Fulghum <[email protected]>

--- a/drivers/char/synclink_gt.c 2009-06-09 22:05:27.000000000 -0500
+++ b/drivers/char/synclink_gt.c 2009-06-16 13:58:14.000000000 -0500
@@ -463,7 +463,6 @@ static unsigned int free_tbuf_count(stru
static unsigned int tbuf_bytes(struct slgt_info *info);
static void reset_tbufs(struct slgt_info *info);
static void tdma_reset(struct slgt_info *info);
-static void tdma_start(struct slgt_info *info);
static void tx_load(struct slgt_info *info, const char *buf, unsigned int count);

static void get_signals(struct slgt_info *info);
@@ -791,6 +790,18 @@ static void set_termios(struct tty_struc
}
}

+static void update_tx_timer(struct slgt_info *info)
+{
+ /*
+ * use worst case speed of 1200bps to calculate transmit timeout
+ * based on data in buffers (tbuf_bytes) and FIFO (128 bytes)
+ */
+ if (info->params.mode == MGSL_MODE_HDLC) {
+ int timeout = (tbuf_bytes(info) * 7) + 1000;
+ mod_timer(&info->tx_timer, jiffies + msecs_to_jiffies(timeout));
+ }
+}
+
static int write(struct tty_struct *tty,
const unsigned char *buf, int count)
{
@@ -834,8 +845,18 @@ start:
spin_lock_irqsave(&info->lock,flags);
if (!info->tx_active)
tx_start(info);
- else
- tdma_start(info);
+ else if (!(rd_reg32(info, TDCSR) & BIT0)) {
+ /* transmit still active but transmit DMA stopped */
+ unsigned int i = info->tbuf_current;
+ if (!i)
+ i = info->tbuf_count;
+ i--;
+ /* if DMA buf unsent must try later after tx idle */
+ if (desc_count(info->tbufs[i]))
+ ret = 0;
+ }
+ if (ret > 0)
+ update_tx_timer(info);
spin_unlock_irqrestore(&info->lock,flags);
}

@@ -1498,10 +1519,9 @@ static int hdlcdev_xmit(struct sk_buff *
/* save start time for transmit timeout detection */
dev->trans_start = jiffies;

- /* start hardware transmitter if necessary */
spin_lock_irqsave(&info->lock,flags);
- if (!info->tx_active)
- tx_start(info);
+ tx_start(info);
+ update_tx_timer(info);
spin_unlock_irqrestore(&info->lock,flags);

return 0;
@@ -3886,50 +3906,19 @@ static void tx_start(struct slgt_info *i
slgt_irq_on(info, IRQ_TXUNDER + IRQ_TXIDLE);
/* clear tx idle and underrun status bits */
wr_reg16(info, SSR, (unsigned short)(IRQ_TXIDLE + IRQ_TXUNDER));
- if (info->params.mode == MGSL_MODE_HDLC)
- mod_timer(&info->tx_timer, jiffies +
- msecs_to_jiffies(5000));
} else {
slgt_irq_off(info, IRQ_TXDATA);
slgt_irq_on(info, IRQ_TXIDLE);
/* clear tx idle status bit */
wr_reg16(info, SSR, IRQ_TXIDLE);
}
- tdma_start(info);
+ /* set 1st descriptor address and start DMA */
+ wr_reg32(info, TDDAR, info->tbufs[info->tbuf_start].pdesc);
+ wr_reg32(info, TDCSR, BIT2 + BIT0);
info->tx_active = true;
}
}

-/*
- * start transmit DMA if inactive and there are unsent buffers
- */
-static void tdma_start(struct slgt_info *info)
-{
- unsigned int i;
-
- if (rd_reg32(info, TDCSR) & BIT0)
- return;
-
- /* transmit DMA inactive, check for unsent buffers */
- i = info->tbuf_start;
- while (!desc_count(info->tbufs[i])) {
- if (++i == info->tbuf_count)
- i = 0;
- if (i == info->tbuf_current)
- return;
- }
- info->tbuf_start = i;
-
- /* there are unsent buffers, start transmit DMA */
-
- /* reset needed if previous error condition */
- tdma_reset(info);
-
- /* set 1st descriptor address */
- wr_reg32(info, TDDAR, info->tbufs[info->tbuf_start].pdesc);
- wr_reg32(info, TDCSR, BIT2 + BIT0); /* IRQ + DMA enable */
-}
-
static void tx_stop(struct slgt_info *info)
{
unsigned short val;
@@ -4942,8 +4931,7 @@ static void tx_timeout(unsigned long con
info->icount.txtimeout++;
}
spin_lock_irqsave(&info->lock,flags);
- info->tx_active = false;
- info->tx_count = 0;
+ tx_stop(info);
spin_unlock_irqrestore(&info->lock,flags);

#if SYNCLINK_GENERIC_HDLC


2009-06-23 06:20:27

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] synclink_gt fix transmit race and timeout

On Tue, 16 Jun 2009 14:44:11 -0500 Paul Fulghum <[email protected]> wrote:

> Fix race condition when adding transmit data to active DMA buffer ring
> that can cause transmit stall.
> Update transmit timeout when adding data to active DMA buffer ring.
> Base transmit timeout on amount of buffered data instead of
> using fixed value.
>

It's not a terribly good changelog, sorry.

It fails to describe the race condition?

It fails to explain the user-visible effects of the bug which was
fixed. Those who need to make should-we-backport-this-to-stable
decisions need to know this. Often we are told, often we can guess.
Sometimes neither.


>
> --- a/drivers/char/synclink_gt.c 2009-06-09 22:05:27.000000000 -0500
> +++ b/drivers/char/synclink_gt.c 2009-06-16 13:58:14.000000000 -0500
> @@ -463,7 +463,6 @@ static unsigned int free_tbuf_count(stru
> static unsigned int tbuf_bytes(struct slgt_info *info);
> static void reset_tbufs(struct slgt_info *info);
> static void tdma_reset(struct slgt_info *info);
> -static void tdma_start(struct slgt_info *info);
> static void tx_load(struct slgt_info *info, const char *buf, unsigned int count);
>
> static void get_signals(struct slgt_info *info);
> @@ -791,6 +790,18 @@ static void set_termios(struct tty_struc
> }
> }
>
> +static void update_tx_timer(struct slgt_info *info)
> +{
> + /*
> + * use worst case speed of 1200bps to calculate transmit timeout
> + * based on data in buffers (tbuf_bytes) and FIFO (128 bytes)
> + */
> + if (info->params.mode == MGSL_MODE_HDLC) {
> + int timeout = (tbuf_bytes(info) * 7) + 1000;
> + mod_timer(&info->tx_timer, jiffies + msecs_to_jiffies(timeout));
> + }
> +}

Where did the "7" come from?

> static int write(struct tty_struct *tty,
> const unsigned char *buf, int count)
> {
> @@ -834,8 +845,18 @@ start:
> spin_lock_irqsave(&info->lock,flags);
> if (!info->tx_active)
> tx_start(info);
> - else
> - tdma_start(info);
> + else if (!(rd_reg32(info, TDCSR) & BIT0)) {
> + /* transmit still active but transmit DMA stopped */
> + unsigned int i = info->tbuf_current;
> + if (!i)
> + i = info->tbuf_count;
> + i--;
> + /* if DMA buf unsent must try later after tx idle */
> + if (desc_count(info->tbufs[i]))
> + ret = 0;
> + }
> + if (ret > 0)
> + update_tx_timer(info);
> spin_unlock_irqrestore(&info->lock,flags);
> }
>
> @@ -1498,10 +1519,9 @@ static int hdlcdev_xmit(struct sk_buff *
> /* save start time for transmit timeout detection */
> dev->trans_start = jiffies;
>
> - /* start hardware transmitter if necessary */
> spin_lock_irqsave(&info->lock,flags);
> - if (!info->tx_active)
> - tx_start(info);
> + tx_start(info);
> + update_tx_timer(info);
> spin_unlock_irqrestore(&info->lock,flags);
>
> return 0;
> @@ -3886,50 +3906,19 @@ static void tx_start(struct slgt_info *i
> slgt_irq_on(info, IRQ_TXUNDER + IRQ_TXIDLE);
> /* clear tx idle and underrun status bits */
> wr_reg16(info, SSR, (unsigned short)(IRQ_TXIDLE + IRQ_TXUNDER));
> - if (info->params.mode == MGSL_MODE_HDLC)
> - mod_timer(&info->tx_timer, jiffies +
> - msecs_to_jiffies(5000));
> } else {
> slgt_irq_off(info, IRQ_TXDATA);
> slgt_irq_on(info, IRQ_TXIDLE);
> /* clear tx idle status bit */
> wr_reg16(info, SSR, IRQ_TXIDLE);
> }
> - tdma_start(info);
> + /* set 1st descriptor address and start DMA */
> + wr_reg32(info, TDDAR, info->tbufs[info->tbuf_start].pdesc);
> + wr_reg32(info, TDCSR, BIT2 + BIT0);
> info->tx_active = true;
> }
> }
>
> ...
>
> @@ -4942,8 +4931,7 @@ static void tx_timeout(unsigned long con
> info->icount.txtimeout++;
> }
> spin_lock_irqsave(&info->lock,flags);
> - info->tx_active = false;
> - info->tx_count = 0;
> + tx_stop(info);

I have a suspicion that tx_stop() should use del_timer_sync(), not
del_timer(). What happens if the timer handler is concurrently
running?

2009-06-23 15:18:49

by Paul Fulghum

[permalink] [raw]
Subject: Re: [PATCH] synclink_gt fix transmit race and timeout

On Mon, 2009-06-22 at 23:19 -0700, Andrew Morton wrote:
> On Tue, 16 Jun 2009 14:44:11 -0500 Paul Fulghum <[email protected]> wrote:
>
> > Fix race condition when adding transmit data to active DMA buffer ring
> > that can cause transmit stall.

> It's not a terribly good changelog, sorry.
> It fails to describe the race condition?

I attempted to say what was done in the change log and
let the patch describe the details of how it was done.
But I can see that might not be enough.

How's this? :

If after adding transmit data to the transmit DMA ring the
transmit DMA controller has become inactive before reading
the new data and the serial transmitter is still active sending
data already in the transmit FIFO and/or shift register,
then wait for the serial transmitter to become idle before
reactivating the transmit DMA controller.

Otherwise the transmit DMA controller may present the
new data to the serial transmitter in a small timing window
just as the serial transmitter transitions to the idle state.
Hitting this window results in the serial transmitter entering
an inconsistent state where it does not process the new data
and does not signal the transition to the idle state.

>From the driver perspective, this condition appears as an
active transmit DMA controller (correct) with unsent DMA buffers
(correct) and an active serial transmitter (wrong), even though
no data is actually being sent. When all the DMA buffers are full,
no further data is accepted from a user application calling
write() until a timeout occurs and the transmitter and
DMA controller are reset.

--

If this is good enough, I will resubmit the patch with
the updated change log.

--
Paul Fulghum
Microgate Systems, Ltd

2009-06-23 16:28:34

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] synclink_gt fix transmit race and timeout

On Tue, 23 Jun 2009 10:16:01 -0500 Paul Fulghum <[email protected]> wrote:

> On Mon, 2009-06-22 at 23:19 -0700, Andrew Morton wrote:
> > On Tue, 16 Jun 2009 14:44:11 -0500 Paul Fulghum <[email protected]> wrote:
> >
> > > Fix race condition when adding transmit data to active DMA buffer ring
> > > that can cause transmit stall.
>
> > It's not a terribly good changelog, sorry.
> > It fails to describe the race condition?
>
> I attempted to say what was done in the change log and
> let the patch describe the details of how it was done.
> But I can see that might not be enough.
>
> How's this? :
>
> If after adding transmit data to the transmit DMA ring the
> transmit DMA controller has become inactive before reading
> the new data and the serial transmitter is still active sending
> data already in the transmit FIFO and/or shift register,
> then wait for the serial transmitter to become idle before
> reactivating the transmit DMA controller.
>
> Otherwise the transmit DMA controller may present the
> new data to the serial transmitter in a small timing window
> just as the serial transmitter transitions to the idle state.
> Hitting this window results in the serial transmitter entering
> an inconsistent state where it does not process the new data
> and does not signal the transition to the idle state.
>
> >From the driver perspective, this condition appears as an
> active transmit DMA controller (correct) with unsent DMA buffers
> (correct) and an active serial transmitter (wrong), even though
> no data is actually being sent. When all the DMA buffers are full,
> no further data is accepted from a user application calling
> write() until a timeout occurs and the transmitter and
> DMA controller are reset.
>

That covers it ;) But I stil lcan't work out whether we shoul backport
this into -stable.

>
> If this is good enough, I will resubmit the patch with
> the updated change log.

I updated the patch.

I did have a couple of other comments on the patch which seem to have
been missed?

2009-06-23 16:55:03

by Paul Fulghum

[permalink] [raw]
Subject: Re: [PATCH] synclink_gt fix transmit race and timeout

Andrew Morton wrote:
> I did have a couple of other comments on the patch which seem to have
> been missed?

Let's see, there was:
"It fails to explain the user-visible effects of the bug which was fixed."

On Tue, 23 Jun 2009 10:16:01 -0500 Paul Fulghum <[email protected]> wrote:
> When all the DMA buffers are full,
> no further data is accepted from a user application calling
> write() until a timeout occurs and the transmitter and
> DMA controller are reset.

That addresses the user-visible effects.

Then there was:
"Those who need to make should-we-backport-this-to-stable
decisions need to know this. Often we are told, often we can guess."

My opinion on if this should be back ported to a stable branch:

Probably not. It effects a small minority of users
in rare instances with non-fatal consequences, just reduced
performance. Updated drivers for all 2.6 and 2.4 kernels are already
available on our website. If stable maintainers want to back port it
I'm all for it, but they are likely busy enough with critical changes.

--
Paul Fulghum
MicroGate Systems, Ltd.
=Customer Driven, by Design=
(800)444-1982
(512)345-7791 (Direct)
(512)343-9046 (Fax)
Central Time Zone (GMT -5h)
http://www.microgate.com

2009-06-23 17:02:42

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] synclink_gt fix transmit race and timeout

On Tue, 23 Jun 2009 11:55:06 -0600 Paul Fulghum <[email protected]> wrote:

> Andrew Morton wrote:
> > I did have a couple of other comments on the patch which seem to have
> > been missed?
>
> Let's see, there was:
> "It fails to explain the user-visible effects of the bug which was fixed."

I'm referring to

Where did the "7" come from?

and

I have a suspicion that tx_stop() should use del_timer_sync(), not
del_timer(). What happens if the timer handler is concurrently
running?

2009-06-23 17:29:20

by Paul Fulghum

[permalink] [raw]
Subject: Re: [PATCH] synclink_gt fix transmit race and timeout

Andrew Morton wrote:
> I'm referring to
>
> Where did the "7" come from?

Oops, I did not scroll down far enough.

+ /*
+ * use worst case speed of 1200bps to calculate transmit timeout
+ * based on data in buffers (tbuf_bytes) and FIFO (128 bytes)
+ */
+ if (info->params.mode == MGSL_MODE_HDLC) {
+ int timeout = (tbuf_bytes(info) * 7) + 1000;
+ mod_timer(&info->tx_timer, jiffies + msecs_to_jiffies(timeout));
+ }

7 is roughly the number of milliseconds to send a byte at 1200bps.

The problem with externally provided data clocks is that you
don't necessarily know the data rate before hand, so a somewhat
arbitrary worst case assumption is used.

> and
>
> I have a suspicion that tx_stop() should use del_timer_sync(), not
> del_timer(). What happens if the timer handler is concurrently
> running?

Everything is synchronized with info->lock spin_lock,
so nothing critical runs concurrently. tx_stop() is sometimes
called in interrupt context so it can't call del_timer_sync().
If the timer has already fired but has not
run yet it does nothing more than call tx_stop() itself and wake
any transmit waiters so there are no ill effects.

--
Paul Fulghum
MicroGate Systems, Ltd.
=Customer Driven, by Design=
(800)444-1982
(512)345-7791 (Direct)
(512)343-9046 (Fax)
Central Time Zone (GMT -5h)
http://www.microgate.com