[PATCH] ppc64: escape hatch for spinning interrupt deadlocks

08-eeh-spin-counter.patch One an EEH event is triggers, all further I/O to a device is blocked (until reset). Bad device drivers may end up spinning in their interrupt handlers, trying to read an interrupt status register that will never change state. This patch moves that spin counter to a per-device structure, and adds some diagnostic prints to help locate the bad driver. Signed-off-by: Linas Vepstas <linas@linas.org> Signed-off-by: Paul Mackerras <paulus@samba.org>
author: Linas Vepstas <linas@linas.org> 2005-11-03 18:49:31 -0600
committer: Paul Mackerras <paulus@samba.org> 2005-11-10 11:33:33 +1100
commit: 5c1344e988c716470b5609708215dd7f135f0e3a (patch)
tree: b7f736919ffa6b6e07e5b6bc950a4e36cd0d5547 /arch/ppc64
parent: fd761fd876e4d1c0d07b6d93bc45c999fa596cb0 (diff)
1 files changed, 13 insertions, 8 deletions
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c
index e7522f6da69..0c52c2de92e 100644
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -78,14 +78,12 @@ DECLARE_WORK(eeh_event_wq, eeh_event_handler, NULL);
 
 static struct notifier_block *eeh_notifier_chain;
 
-/*
- * If a device driver keeps reading an MMIO register in an interrupt
+/* If a device driver keeps reading an MMIO register in an interrupt
  * handler after a slot isolation event has occurred, we assume it
  * is broken and panic.  This sets the threshold for how many read
  * attempts we allow before panicking.
  */
-#define EEH_MAX_FAILS	1000
-static atomic_t eeh_fail_count;
+#define EEH_MAX_FAILS	100000
 
 /* RTAS tokens */
 static int ibm_set_eeh_option;
@@ -521,7 +519,6 @@ static void eeh_event_handler(void *dummy)
 		       "%s\n", event->reset_state,
 		       pci_name(event->dev));
 
-		atomic_set(&eeh_fail_count, 0);
 		notifier_call_chain (&eeh_notifier_chain,
 				     EEH_NOTIFY_FREEZE, event);
 
@@ -657,12 +654,18 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	spin_lock_irqsave(&confirm_error_lock, flags);
 	rc = 1;
 	if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
-		atomic_inc(&eeh_fail_count);
-		if (atomic_read(&eeh_fail_count) >= EEH_MAX_FAILS) {
+		pdn->eeh_check_count ++;
+		if (pdn->eeh_check_count >= EEH_MAX_FAILS) {
+			printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n",
+			        pdn->eeh_check_count);
+			dump_stack();
+			
 			/* re-read the slot reset state */
 			if (read_slot_reset_state(pdn, rets) != 0)
 				rets[0] = -1;	/* reset state unknown */
-			eeh_panic(dev, rets[0]);
+
+			/* If we are here, then we hit an infinite loop. Stop. */
+			panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev));
 		}
 		goto dn_unlock;
 	}
@@ -808,6 +811,8 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
 	struct pci_dn *pdn = PCI_DN(dn);
 
 	pdn->eeh_mode = 0;
+	pdn->eeh_check_count = 0;
+	pdn->eeh_freeze_count = 0;
 
 	if (status && strcmp(status, "ok") != 0)
 		return NULL;	/* ignore devices with bad status */
author	Linas Vepstas <linas@linas.org>	2005-11-03 18:49:31 -0600
committer	Paul Mackerras <paulus@samba.org>	2005-11-10 11:33:33 +1100
commit	5c1344e988c716470b5609708215dd7f135f0e3a (patch)
tree	b7f736919ffa6b6e07e5b6bc950a4e36cd0d5547 /arch/ppc64
parent	fd761fd876e4d1c0d07b6d93bc45c999fa596cb0 (diff)