[SPARC64]: More SUN4V cpu mondo bug fixing.

This cpu mondo sending interface isn't all that easy to use correctly... We were clearing out the wrong bits from the "mask" after getting something other than EOK from the hypervisor: the loop used the cpu_list[] index as the bit to clear, rather than the number of the cpu that had been sent to. It turns out we can simply resend the same cpu_list[] array to the hypervisor, with the 0xffff "done" entries still in there, and it will do the right thing. So don't update or try to rebuild the cpu_list[] array to condense it. This requires the "forward_progress" check to be done slightly differently (we now count the 0xffff entries and compare against the count from the previous attempt), but this new scheme is less bug prone than what we were doing before. Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller <davem@sunset.davemloft.net> 2006-03-02 21:50:47 -0800
committer: David S. Miller <davem@sunset.davemloft.net> 2006-03-20 01:14:17 -0800
commit: 3cab0c3e8636d5005041aa52224f796c3a4ef872 (patch)
tree: 582c92940f46cb0ecf8fafd4fde1cfd346172366 /arch/sparc64/kernel
parent: bcc28ee0bf390df0d81cc9dafe980faef6b2771a (diff)
1 files changed, 24 insertions, 16 deletions
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 6bc7fd47e44..c4548a88953 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -563,7 +563,7 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 	u64 *mondo;
 	cpumask_t error_mask;
 	unsigned long flags, status;
-	int cnt, retries, this_cpu, i;
+	int cnt, retries, this_cpu, prev_sent, i;
 
 	/* We have to do this whole thing with interrupts fully disabled.
 	 * Otherwise if we send an xcall from interrupt context it will
@@ -595,8 +595,9 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 
 	cpus_clear(error_mask);
 	retries = 0;
+	prev_sent = 0;
 	do {
-		int forward_progress;
+		int forward_progress, n_sent;
 
 		status = sun4v_cpu_mondo_send(cnt,
 					      tb->cpu_list_pa,
@@ -606,18 +607,23 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 		if (likely(status == HV_EOK))
 			break;
 
-		/* First, clear out all the cpus in the mask that were
-		 * successfully sent to.  The hypervisor indicates this
-		 * by setting the cpu list entry of such cpus to 0xffff.
+		/* First, see if we made any forward progress.
+		 *
+		 * The hypervisor indicates successful sends by setting
+		 * cpu list entries to the value 0xffff.
 		 */
-		forward_progress = 0;
+		n_sent = 0;
 		for (i = 0; i < cnt; i++) {
-			if (cpu_list[i] == 0xffff) {
-				cpu_clear(i, mask);
-				forward_progress = 1;
-			}
+			if (likely(cpu_list[i] == 0xffff))
+				n_sent++;
 		}
 
+		forward_progress = 0;
+		if (n_sent > prev_sent)
+			forward_progress = 1;
+
+		prev_sent = n_sent;
+
 		/* If we get a HV_ECPUERROR, then one or more of the cpus
 		 * in the list are in error state.  Use the cpu_state()
 		 * hypervisor call to find out which cpus are in error state.
@@ -634,18 +640,20 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t
 				err = sun4v_cpu_state(cpu);
 				if (err >= 0 &&
 				    err == HV_CPU_STATE_ERROR) {
-					cpu_clear(cpu, mask);
+					cpu_list[i] = 0xffff;
 					cpu_set(cpu, error_mask);
 				}
 			}
 		} else if (unlikely(status != HV_EWOULDBLOCK))
 			goto fatal_mondo_error;
 
-		/* Rebuild the cpu_list[] array and try again.  */
-		cnt = 0;
-		for_each_cpu_mask(i, mask)
-			cpu_list[cnt++] = i;
-
+		/* Don't bother rewriting the CPU list, just leave the
+		 * 0xffff and non-0xffff entries in there and the
+		 * hypervisor will do the right thing.
+		 *
+		 * Only advance timeout state if we didn't make any
+		 * forward progress.
+		 */
 		if (unlikely(!forward_progress)) {
 			if (unlikely(++retries > 10000))
 				goto fatal_mondo_timeout;
author	David S. Miller <davem@sunset.davemloft.net>	2006-03-02 21:50:47 -0800
committer	David S. Miller <davem@sunset.davemloft.net>	2006-03-20 01:14:17 -0800
commit	3cab0c3e8636d5005041aa52224f796c3a4ef872 (patch)
tree	582c92940f46cb0ecf8fafd4fde1cfd346172366 /arch/sparc64/kernel
parent	bcc28ee0bf390df0d81cc9dafe980faef6b2771a (diff)