drm/i915: Trigger hangcheck if we detect a repeating missed IRQ

On the first instance we just wish to kick the waiters and see if that terminates the wait conditions. If it does not, then we do not want to keep retrying without ever making any forward progress and becoming stuck in a hangcheck loop.

Reported-and-tested-by: Lukas Hejtmanek <xhejtman@fi.muni.cz>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=48209
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
author: Chris Wilson <chris@chris-wilson.co.uk> 2012-04-10 17:00:41 +0100
committer: Daniel Vetter <daniel.vetter@ffwll.ch> 2012-04-15 19:37:37 +0200
commit: d1e61e7fc4456c4cb9a33ed182edf40e34ddedea (patch)
tree: 24210178e48521c970b66c4c673b9cbff9a2bb2a
parent: 56fa6d6ff76c7700f8dd131bee9ffa6c3c06dcd4 (diff)
1 files changed, 38 insertions, 25 deletions
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index febddc2952f..39663f51e10 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1875,6 +1875,36 @@ static bool kick_ring(struct intel_ring_buffer *ring)
 	return false;
 }
 
+static bool i915_hangcheck_hung(struct drm_device *dev)
+{
+	drm_i915_private_t *dev_priv = dev->dev_private;
+
+	if (dev_priv->hangcheck_count++ > 1) {
+		DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
+		i915_handle_error(dev, true);
+
+		if (!IS_GEN2(dev)) {
+			/* Is the chip hanging on a WAIT_FOR_EVENT?
+			 * If so we can simply poke the RB_WAIT bit
+			 * and break the hang. This should work on
+			 * all but the second generation chipsets.
+			 */
+			if (kick_ring(&dev_priv->ring[RCS]))
+				return false;
+
+			if (HAS_BSD(dev) && kick_ring(&dev_priv->ring[VCS]))
+				return false;
+
+			if (HAS_BLT(dev) && kick_ring(&dev_priv->ring[BCS]))
+				return false;
+		}
+
+		return true;
+	}
+
+	return false;
+}
+
 /**
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. The first time this is called we simply record
@@ -1895,9 +1925,14 @@ void i915_hangcheck_elapsed(unsigned long data)
 	if (i915_hangcheck_ring_idle(&dev_priv->ring[RCS], &err) &&
 	    i915_hangcheck_ring_idle(&dev_priv->ring[VCS], &err) &&
 	    i915_hangcheck_ring_idle(&dev_priv->ring[BCS], &err)) {
-		dev_priv->hangcheck_count = 0;
-		if (err)
+		if (err) {
+			if (i915_hangcheck_hung(dev))
+				return;
+
 			goto repeat;
+		}
+
+		dev_priv->hangcheck_count = 0;
 		return;
 	}
 
@@ -1919,30 +1954,8 @@ void i915_hangcheck_elapsed(unsigned long data)
 	    dev_priv->last_acthd_blt == acthd_blt &&
 	    dev_priv->last_instdone == instdone &&
 	    dev_priv->last_instdone1 == instdone1) {
-		if (dev_priv->hangcheck_count++ > 1) {
-			DRM_ERROR("Hangcheck timer elapsed... GPU hung\n");
-			i915_handle_error(dev, true);
-
-			if (!IS_GEN2(dev)) {
-				/* Is the chip hanging on a WAIT_FOR_EVENT?
-				 * If so we can simply poke the RB_WAIT bit
-				 * and break the hang. This should work on
-				 * all but the second generation chipsets.
-				 */
-				if (kick_ring(&dev_priv->ring[RCS]))
-					goto repeat;
-
-				if (HAS_BSD(dev) &&
-				    kick_ring(&dev_priv->ring[VCS]))
-					goto repeat;
-
-				if (HAS_BLT(dev) &&
-				    kick_ring(&dev_priv->ring[BCS]))
-					goto repeat;
-			}
-
+		if (i915_hangcheck_hung(dev))
 			return;
-		}
 	} else {
 		dev_priv->hangcheck_count = 0;
author	Chris Wilson <chris@chris-wilson.co.uk>	2012-04-10 17:00:41 +0100
committer	Daniel Vetter <daniel.vetter@ffwll.ch>	2012-04-15 19:37:37 +0200
commit	d1e61e7fc4456c4cb9a33ed182edf40e34ddedea (patch)
tree	24210178e48521c970b66c4c673b9cbff9a2bb2a
parent	56fa6d6ff76c7700f8dd131bee9ffa6c3c06dcd4 (diff)