Nouveau takes down my system quite reliably when any hotplug event occurs.
The bug happens because the IRQ handler didn't acknowledge the hotplug
state until the bottom half, so the card generated a new interrupt
immediately, starving the bottom half and permanently starving that CPU
(and hence the bottom half).
Even with this fix, a lot of the IRQ code looks rather broken.
This is tested on 2.6.36 (and makes the system stable for me), but it also
applies cleanly to 2.6.37 (untested, but surely also necessary). Fedora 14's
2.6.35 kernels seem to have the same problem for me, so I suspect that 2.6.35
needs this fix as well. (All of my tests are on an NV50 card.)
Changes from v1:
- Ignore unrequested hotplug bits (I accidentally removed that part).
- Support newer hardware (untested -- Ben, can you check this?)
Andy Lutomirski (2):
Use existing defines for NV50 hotplug registers
nouveau: Acknowledge HPD irq in handler, not bottom half
drivers/gpu/drm/nouveau/nouveau_drv.h | 6 +++++
drivers/gpu/drm/nouveau/nouveau_irq.c | 1 +
drivers/gpu/drm/nouveau/nv50_display.c | 39 +++++++++++++++++++++++---------
3 files changed, 35 insertions(+), 11 deletions(-)
--
1.7.3.2
This doesn't change behavior at all, but it makes the code a lot easier
to understand.
Signed-off-by: Andy Lutomirski <[email protected]>
---
drivers/gpu/drm/nouveau/nv50_display.c | 8 ++++----
1 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nv50_display.c b/drivers/gpu/drm/nouveau/nv50_display.c
index 612fa6d..83a7d27 100644
--- a/drivers/gpu/drm/nouveau/nv50_display.c
+++ b/drivers/gpu/drm/nouveau/nv50_display.c
@@ -453,8 +453,8 @@ static int nv50_display_disable(struct drm_device *dev)
nv_wr32(dev, NV50_PDISPLAY_INTR_EN, 0x00000000);
/* disable hotplug interrupts */
- nv_wr32(dev, 0xe054, 0xffffffff);
- nv_wr32(dev, 0xe050, 0x00000000);
+ nv_wr32(dev, NV50_PCONNECTOR_HOTPLUG_CTRL, 0xffffffff);
+ nv_wr32(dev, NV50_PCONNECTOR_HOTPLUG_INTR, 0x00000000);
if (dev_priv->chipset >= 0x90) {
nv_wr32(dev, 0xe074, 0xffffffff);
nv_wr32(dev, 0xe070, 0x00000000);
@@ -1014,7 +1014,7 @@ nv50_display_irq_hotplug_bh(struct work_struct *work)
uint32_t unplug_mask, plug_mask, change_mask;
uint32_t hpd0, hpd1 = 0;
- hpd0 = nv_rd32(dev, 0xe054) & nv_rd32(dev, 0xe050);
+ hpd0 = nv_rd32(dev, NV50_PCONNECTOR_HOTPLUG_CTRL) & nv_rd32(dev, NV50_PCONNECTOR_HOTPLUG_INTR);
if (dev_priv->chipset >= 0x90)
hpd1 = nv_rd32(dev, 0xe074) & nv_rd32(dev, 0xe070);
@@ -1058,7 +1058,7 @@ nv50_display_irq_hotplug_bh(struct work_struct *work)
helper->dpms(connector->encoder, DRM_MODE_DPMS_OFF);
}
- nv_wr32(dev, 0xe054, nv_rd32(dev, 0xe054));
+ nv_wr32(dev, NV50_PCONNECTOR_HOTPLUG_CTRL, nv_rd32(dev, NV50_PCONNECTOR_HOTPLUG_CTRL));
if (dev_priv->chipset >= 0x90)
nv_wr32(dev, 0xe074, nv_rd32(dev, 0xe074));
--
1.7.3.2
The old code generated an interrupt storm bad enough to completely
take down my system.
Signed-off-by: Andy Lutomirski <[email protected]>
---
drivers/gpu/drm/nouveau/nouveau_drv.h | 6 +++++
drivers/gpu/drm/nouveau/nouveau_irq.c | 1 +
drivers/gpu/drm/nouveau/nv50_display.c | 35 +++++++++++++++++++++++--------
3 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index b1be617..c926d88 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -531,6 +531,12 @@ struct drm_nouveau_private {
struct work_struct irq_work;
struct work_struct hpd_work;
+ struct {
+ spinlock_t lock;
+ uint32_t hpd0_bits;
+ uint32_t hpd1_bits;
+ } hpd_state;
+
struct list_head vbl_waiting;
struct {
diff --git a/drivers/gpu/drm/nouveau/nouveau_irq.c b/drivers/gpu/drm/nouveau/nouveau_irq.c
index 794b0ee..b62a601 100644
--- a/drivers/gpu/drm/nouveau/nouveau_irq.c
+++ b/drivers/gpu/drm/nouveau/nouveau_irq.c
@@ -52,6 +52,7 @@ nouveau_irq_preinstall(struct drm_device *dev)
if (dev_priv->card_type >= NV_50) {
INIT_WORK(&dev_priv->irq_work, nv50_display_irq_handler_bh);
INIT_WORK(&dev_priv->hpd_work, nv50_display_irq_hotplug_bh);
+ spin_lock_init(&dev_priv->hpd_state.lock);
INIT_LIST_HEAD(&dev_priv->vbl_waiting);
}
}
diff --git a/drivers/gpu/drm/nouveau/nv50_display.c b/drivers/gpu/drm/nouveau/nv50_display.c
index 83a7d27..014f69c 100644
--- a/drivers/gpu/drm/nouveau/nv50_display.c
+++ b/drivers/gpu/drm/nouveau/nv50_display.c
@@ -1012,11 +1012,18 @@ nv50_display_irq_hotplug_bh(struct work_struct *work)
struct drm_connector *connector;
const uint32_t gpio_reg[4] = { 0xe104, 0xe108, 0xe280, 0xe284 };
uint32_t unplug_mask, plug_mask, change_mask;
- uint32_t hpd0, hpd1 = 0;
+ uint32_t hpd0, hpd1;
- hpd0 = nv_rd32(dev, NV50_PCONNECTOR_HOTPLUG_CTRL) & nv_rd32(dev, NV50_PCONNECTOR_HOTPLUG_INTR);
+ spin_lock_irq(&dev_priv->hpd_state.lock);
+ hpd0 = dev_priv->hpd_state.hpd0_bits;
+ dev_priv->hpd_state.hpd0_bits = 0;
+ hpd1 = dev_priv->hpd_state.hpd1_bits;
+ dev_priv->hpd_state.hpd1_bits = 0;
+ spin_unlock_irq(&dev_priv->hpd_state.lock);
+
+ hpd0 &= nv_rd32(dev, NV50_PCONNECTOR_HOTPLUG_INTR);
if (dev_priv->chipset >= 0x90)
- hpd1 = nv_rd32(dev, 0xe074) & nv_rd32(dev, 0xe070);
+ hpd1 &= nv_rd32(dev, 0xe070);
plug_mask = (hpd0 & 0x0000ffff) | (hpd1 << 16);
unplug_mask = (hpd0 >> 16) | (hpd1 & 0xffff0000);
@@ -1058,10 +1065,6 @@ nv50_display_irq_hotplug_bh(struct work_struct *work)
helper->dpms(connector->encoder, DRM_MODE_DPMS_OFF);
}
- nv_wr32(dev, NV50_PCONNECTOR_HOTPLUG_CTRL, nv_rd32(dev, NV50_PCONNECTOR_HOTPLUG_CTRL));
- if (dev_priv->chipset >= 0x90)
- nv_wr32(dev, 0xe074, nv_rd32(dev, 0xe074));
-
drm_helper_hpd_irq_event(dev);
}
@@ -1072,8 +1075,22 @@ nv50_display_irq_handler(struct drm_device *dev)
uint32_t delayed = 0;
if (nv_rd32(dev, NV50_PMC_INTR_0) & NV50_PMC_INTR_0_HOTPLUG) {
- if (!work_pending(&dev_priv->hpd_work))
- queue_work(dev_priv->wq, &dev_priv->hpd_work);
+ uint32_t hpd0_bits, hpd1_bits = 0;
+
+ hpd0_bits = nv_rd32(dev, NV50_PCONNECTOR_HOTPLUG_CTRL);
+ nv_wr32(dev, NV50_PCONNECTOR_HOTPLUG_CTRL, hpd0_bits);
+
+ if (dev_priv->chipset >= 0x90) {
+ hpd1_bits = nv_rd32(dev, 0xe074);
+ nv_wr32(dev, 0xe074, hpd1_bits);
+ }
+
+ spin_lock(&dev_priv->hpd_state.lock);
+ dev_priv->hpd_state.hpd0_bits |= hpd0_bits;
+ dev_priv->hpd_state.hpd1_bits |= hpd1_bits;
+ spin_unlock(&dev_priv->hpd_state.lock);
+
+ queue_work(dev_priv->wq, &dev_priv->hpd_work);
}
while (nv_rd32(dev, NV50_PMC_INTR_0) & NV50_PMC_INTR_0_DISPLAY) {
--
1.7.3.2
On Wed, Nov 10, 2010 at 6:04 PM, Andy Lutomirski <[email protected]> wrote:
> Nouveau takes down my system quite reliably when any hotplug event occurs.
> The bug happens because the IRQ handler didn't acknowledge the hotplug
> state until the bottom half, so the card generated a new interrupt
> immediately, starving the bottom half and permanently starving that CPU
> (and hence the bottom half).
>
> Even with this fix, a lot of the IRQ code looks rather broken.
>
> This is tested on 2.6.36 (and makes the system stable for me), but it also
> applies cleanly to 2.6.37 (untested, but surely also necessary). Fedora 14's
> 2.6.35 kernels seem to have the same problem for me, so I suspect that 2.6.35
> needs this fix as well. (All of my tests are on an NV50 card.)
>
> Changes from v1:
> - Ignore unrequested hotplug bits (I accidentally removed that part).
> - Support newer hardware (untested -- Ben, can you check this?)
Just a quick ping: is this making its way to Linus (and stable)? I've
been running it for five days through (literally, due to monitor bugs)
thousands of plug/unplug cycles with no ill effects.
(Can we *please* get rid of, or at least ratelimit, the
plugged/unplugged printk? It's taking over my logs, and I'm almost
certain that it's not a driver bug.)
--Andy
On Tue, 2010-11-16 at 17:19 -0500, Andrew Lutomirski wrote:
> On Wed, Nov 10, 2010 at 6:04 PM, Andy Lutomirski <[email protected]> wrote:
> > Nouveau takes down my system quite reliably when any hotplug event occurs.
> > The bug happens because the IRQ handler didn't acknowledge the hotplug
> > state until the bottom half, so the card generated a new interrupt
> > immediately, starving the bottom half and permanently starving that CPU
> > (and hence the bottom half).
> >
> > Even with this fix, a lot of the IRQ code looks rather broken.
> >
> > This is tested on 2.6.36 (and makes the system stable for me), but it also
> > applies cleanly to 2.6.37 (untested, but surely also necessary). Fedora 14's
> > 2.6.35 kernels seem to have the same problem for me, so I suspect that 2.6.35
> > needs this fix as well. (All of my tests are on an NV50 card.)
> >
> > Changes from v1:
> > - Ignore unrequested hotplug bits (I accidentally removed that part).
> > - Support newer hardware (untested -- Ben, can you check this?)
>
> Just a quick ping: is this making its way to Linus (and stable)? I've
> been running it for five days through (literally, due to monitor bugs)
> thousands of plug/unplug cycles with no ill effects.
This issue has been fixed in nouveau git now, but that fix can't be
pulled into stable/linus as it depends on architectural changes to
nouveau that Linus probably wouldn't accept this late.
I responded to a mail asking that the patches be redone to just fix the
bug *without* removing the "magic numbers" (so, just patch 2/2
essentially), to avoid more unnecessary conflicts with nouveau git.
Ben.
>
> (Can we *please* get rid of, or at least ratelimit, the
> plugged/unplugged printk? It's taking over my logs, and I'm almost
> certain that it's not a driver bug.)
>
> --Andy