9 files changed, 300 insertions, 77 deletions
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 8d17d28ef91..355593dd8ef 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -307,8 +307,11 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	if (past->type != DLM_AST &&
 	    past->type != DLM_BAST) {
-		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", "
-		     "name=%.*s\n", past->type, cookie, locklen, name);
+		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu, "
+		     "name=%.*s\n", past->type,
+		     dlm_get_lock_cookie_node(cookie),
+		     dlm_get_lock_cookie_seq(cookie),
+		     locklen, name);
 		ret = DLM_IVLOCKID;
 		goto leave;
 	}
@@ -316,9 +319,11 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
 	res = dlm_lookup_lockres(dlm, name, locklen);
 	if (!res) {
 		mlog(ML_ERROR, "got %sast for unknown lockres! "
-			       "cookie=%"MLFu64", name=%.*s, namelen=%u\n",
+			       "cookie=%u:%llu, name=%.*s, namelen=%u\n",
 		     past->type == DLM_AST ? "" : "b",
-		     cookie, locklen, name, locklen);
+		     dlm_get_lock_cookie_node(cookie),
+		     dlm_get_lock_cookie_seq(cookie),
+		     locklen, name, locklen);
 		ret = DLM_IVLOCKID;
 		goto leave;
 	}
@@ -360,9 +365,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
 			goto do_ast;
 	}
 
-	mlog(ML_ERROR, "got %sast for unknown lock!  cookie=%"MLFu64", "
-		       "name=%.*s, namelen=%u\n",
-             past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen);
+	mlog(ML_ERROR, "got %sast for unknown lock!  cookie=%u:%llu, "
+		       "name=%.*s, namelen=%u\n", 
+		       past->type == DLM_AST ? "" : "b", 
+		       dlm_get_lock_cookie_node(cookie),
+		       dlm_get_lock_cookie_seq(cookie),
+		       locklen, name, locklen);
 
 	ret = DLM_NORMAL;
 unlock_out:
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 9c772583744..88cc43df18f 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -630,6 +630,21 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
 	return status;
 }
 
+/* Node part of a lock cookie: bits 56..63, i.e. the top byte. */
+static inline u8 dlm_get_lock_cookie_node(u64 cookie)
+{
+	u64 top_byte = cookie >> 56;
+
+	return (u8)(top_byte & 0xffULL);
+}
+
+/* Sequence part of a lock cookie: the low 56 bits, returned as
+ * unsigned long long so callers can print it with %llu. */
+static inline unsigned long long dlm_get_lock_cookie_seq(u64 cookie)
+{
+	return (unsigned long long)cookie & 0x00ffffffffffffffULL;
+}
+
 struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
 			       struct dlm_lockstatus *lksb);
 void dlm_lock_get(struct dlm_lock *lock);
@@ -658,6 +673,7 @@ void dlm_complete_thread(struct dlm_ctxt *dlm);
 int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
+void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
 int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
 
@@ -762,6 +778,11 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
 int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
 int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
 int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
+int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			  u8 nodenum, u8 *real_master);
+int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res, u8 *real_master);
+
 
 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
 			       struct dlm_lock_resource *res,
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index f66e2d818cc..8285228d9e3 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -284,8 +284,10 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
 	if (lock->ml.convert_type != LKM_IVMODE) {
 		__dlm_print_one_lock_resource(res);
 		mlog(ML_ERROR, "converting a remote lock that is already "
-		     "converting! (cookie=%"MLFu64", conv=%d)\n",
-		     lock->ml.cookie, lock->ml.convert_type);
+		     "converting! (cookie=%u:%llu, conv=%d)\n",
+		     dlm_get_lock_cookie_node(lock->ml.cookie),
+		     dlm_get_lock_cookie_seq(lock->ml.cookie),
+		     lock->ml.convert_type);
 		status = DLM_DENIED;
 		goto bail;
 	}
@@ -513,8 +515,9 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 leave:
 	if (!lock)
 		mlog(ML_ERROR, "did not find lock to convert on grant queue! "
-			       "cookie=%"MLFu64"\n",
-		     cnv->cookie);
+			       "cookie=%u:%llu\n",
+			       dlm_get_lock_cookie_node(cnv->cookie),
+			       dlm_get_lock_cookie_seq(cnv->cookie));
 	else
 		dlm_lock_put(lock);
 
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 54f61b76ab5..c7eae5d3324 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -72,8 +72,10 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 		lock = list_entry(iter2, struct dlm_lock, list);
 		spin_lock(&lock->spinlock);
 		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-		       "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-		       lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, 
+		       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
+		       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
+		       dlm_get_lock_cookie_node(lock->ml.cookie), 
+		       dlm_get_lock_cookie_seq(lock->ml.cookie), 
 		       list_empty(&lock->ast_list) ? 'y' : 'n',
 		       lock->ast_pending ? 'y' : 'n',
 		       list_empty(&lock->bast_list) ? 'y' : 'n',
@@ -85,8 +87,10 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 		lock = list_entry(iter2, struct dlm_lock, list);
 		spin_lock(&lock->spinlock);
 		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-		       "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-		       lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, 
+		       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
+		       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
+		       dlm_get_lock_cookie_node(lock->ml.cookie), 
+		       dlm_get_lock_cookie_seq(lock->ml.cookie), 
 		       list_empty(&lock->ast_list) ? 'y' : 'n',
 		       lock->ast_pending ? 'y' : 'n',
 		       list_empty(&lock->bast_list) ? 'y' : 'n',
@@ -98,8 +102,10 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 		lock = list_entry(iter2, struct dlm_lock, list);
 		spin_lock(&lock->spinlock);
 		mlog(ML_NOTICE, "    type=%d, conv=%d, node=%u, "
-		       "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
-		       lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, 
+		       "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", 
+		       lock->ml.type, lock->ml.convert_type, lock->ml.node, 
+		       dlm_get_lock_cookie_node(lock->ml.cookie), 
+		       dlm_get_lock_cookie_seq(lock->ml.cookie), 
 		       list_empty(&lock->ast_list) ? 'y' : 'n',
 		       lock->ast_pending ? 'y' : 'n',
 		       list_empty(&lock->bast_list) ? 'y' : 'n',
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index dd2d24dc25e..7e88e24b347 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -596,7 +596,8 @@ static int __init init_dlmfs_fs(void)
 
 	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
 				sizeof(struct dlmfs_inode_private),
-				0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
+				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+					SLAB_MEM_SPREAD),
 				dlmfs_init_once, NULL);
 	if (!dlmfs_inode_cache)
 		return -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 671d4ff222c..6fea28318d6 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -141,13 +141,23 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
 					  res->lockname.len)) {
 			kick_thread = 1;
 			call_ast = 1;
+		} else {
+			mlog(0, "%s: returning DLM_NORMAL to "
+			     "node %u for reco lock\n", dlm->name,
+			     lock->ml.node);
 		}
 	} else {
 		/* for NOQUEUE request, unless we get the
 		 * lock right away, return DLM_NOTQUEUED */
-		if (flags & LKM_NOQUEUE)
+		if (flags & LKM_NOQUEUE) {
 			status = DLM_NOTQUEUED;
-		else {
+			if (dlm_is_recovery_lock(res->lockname.name,
+						 res->lockname.len)) {
+				mlog(0, "%s: returning NOTQUEUED to "
+				     "node %u for reco lock\n", dlm->name,
+				     lock->ml.node);
+			}
+		} else {
 			dlm_lock_get(lock);
 			list_add_tail(&lock->list, &res->blocked);
 			kick_thread = 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 847dd3cc4cf..940be4c13b1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -239,6 +239,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
 				       struct dlm_lock_resource *res,
 				       u8 target);
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res);
 
 
 int dlm_is_host_down(int errno)
@@ -677,6 +679,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 	struct dlm_node_iter iter;
 	unsigned int namelen;
 	int tries = 0;
+	int bit, wait_on_recovery = 0;
 
 	BUG_ON(!lockid);
 
@@ -762,6 +765,18 @@ lookup:
 		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
 		set_bit(dlm->node_num, mle->maybe_map);
 		list_add(&mle->list, &dlm->master_list);
+
+		/* still holding the dlm spinlock, check the recovery map
+		 * to see if there are any nodes that still need to be 
+		 * considered.  these will not appear in the mle nodemap
+		 * but they might own this lockres.  wait on them. */
+		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+		if (bit < O2NM_MAX_NODES) {
+			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
+			     "recover before lock mastery can begin\n",
+			     dlm->name, namelen, (char *)lockid, bit);
+			wait_on_recovery = 1;
+		}
 	}
 
 	/* at this point there is either a DLM_MLE_BLOCK or a
@@ -779,6 +794,39 @@ lookup:
 	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);
 
+	while (wait_on_recovery) {
+		/* any cluster changes that occurred after dropping the
+		 * dlm spinlock would be detectable by a change on the mle,
+		 * so we only need to clear out the recovery map once. */
+		if (dlm_is_recovery_lock(lockid, namelen)) {
+			mlog(ML_NOTICE, "%s: recovery map is not empty, but "
+			     "must master $RECOVERY lock now\n", dlm->name);
+			if (!dlm_pre_master_reco_lockres(dlm, res))
+				wait_on_recovery = 0;
+			else {
+				mlog(0, "%s: waiting 500ms for heartbeat state "
+				    "change\n", dlm->name);
+				msleep(500);
+			}
+			continue;
+		} 
+
+		dlm_kick_recovery_thread(dlm);
+		msleep(100);
+		dlm_wait_for_recovery(dlm);
+
+		spin_lock(&dlm->spinlock);
+		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
+		if (bit < O2NM_MAX_NODES) {
+			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
+			     "recover before lock mastery can begin\n",
+			     dlm->name, namelen, (char *)lockid, bit);
+			wait_on_recovery = 1;
+		} else
+			wait_on_recovery = 0;
+		spin_unlock(&dlm->spinlock);
+	}
+
 	/* must wait for lock to be mastered elsewhere */
 	if (blocked)
 		goto wait;
@@ -792,7 +840,15 @@ redo_request:
 			mlog_errno(ret);
 		if (mle->master != O2NM_MAX_NODES) {
 			/* found a master ! */
-			break;
+			if (mle->master <= nodenum)
+				break;
+			/* if our master request has not reached the master
+			 * yet, keep going until it does.  this is how the
+			 * master will know that asserts are needed back to
+			 * the lower nodes. */
+			mlog(0, "%s:%.*s: requests only up to %u but master "
+			     "is %u, keep going\n", dlm->name, namelen,
+			     lockid, nodenum, mle->master);
 		}
 	}
 
@@ -860,7 +916,19 @@ recheck:
 	/* check if another node has already become the owner */
 	spin_lock(&res->spinlock);
 	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
+		mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
+		     res->lockname.len, res->lockname.name, res->owner);
 		spin_unlock(&res->spinlock);
+		/* this will cause the master to re-assert across
+		 * the whole cluster, freeing up mles */
+		ret = dlm_do_master_request(mle, res->owner);
+		if (ret < 0) {
+			/* give recovery a chance to run */
+			mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
+			msleep(500);
+			goto recheck;
+		}
+		ret = 0;
 		goto leave;
 	}
 	spin_unlock(&res->spinlock);
@@ -1244,13 +1312,14 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
 {
 	u8 response = DLM_MASTER_RESP_MAYBE;
 	struct dlm_ctxt *dlm = data;
-	struct dlm_lock_resource *res;
+	struct dlm_lock_resource *res = NULL;
 	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
 	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
 	char *name;
 	unsigned int namelen;
 	int found, ret;
 	int set_maybe;
+	int dispatch_assert = 0;
 
 	if (!dlm_grab(dlm))
 		return DLM_MASTER_RESP_NO;
@@ -1287,7 +1356,6 @@ way_up_top:
 		}
 
 		if (res->owner == dlm->node_num) {
-			u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP;
 			spin_unlock(&res->spinlock);
 			// mlog(0, "this node is the master\n");
 			response = DLM_MASTER_RESP_YES;
@@ -1300,16 +1368,7 @@ way_up_top:
 			 * caused all nodes up to this one to
 			 * create mles.  this node now needs to
 			 * go back and clean those up. */
-			mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
-			     dlm->node_num, res->lockname.len, res->lockname.name);
-			ret = dlm_dispatch_assert_master(dlm, res, 1,
-							 request->node_idx,
-							 flags);
-			if (ret < 0) {
-				mlog(ML_ERROR, "failed to dispatch assert "
-				     "master work\n");
-				response = DLM_MASTER_RESP_ERROR;
-			}
+			dispatch_assert = 1;
 			goto send_response;
 		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
 			spin_unlock(&res->spinlock);
@@ -1357,9 +1416,13 @@ way_up_top:
 			}
 		} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
 			set_maybe = 0;
-			if (tmpmle->master == dlm->node_num)
+			if (tmpmle->master == dlm->node_num) {
 				response = DLM_MASTER_RESP_YES;
-			else
+				/* this node will be the owner.
+				 * go back and clean the mles on any
+				 * other nodes */
+				dispatch_assert = 1;
+			} else
 				response = DLM_MASTER_RESP_NO;
 		} else {
 			// mlog(0, "this node is attempting to "
@@ -1398,8 +1461,8 @@ way_up_top:
 			mle = (struct dlm_master_list_entry *)
 				kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
 			if (!mle) {
-				// bad bad bad... this sucks.
 				response = DLM_MASTER_RESP_ERROR;
+				mlog_errno(-ENOMEM);
 				goto send_response;
 			}
 			spin_lock(&dlm->spinlock);
@@ -1418,25 +1481,19 @@ way_up_top:
 		// mlog(0, "mle was found\n");
 		set_maybe = 1;
 		spin_lock(&tmpmle->spinlock);
+		if (tmpmle->master == dlm->node_num) {
+			mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
+			BUG();
+		}
 		if (tmpmle->type == DLM_MLE_BLOCK)
 			response = DLM_MASTER_RESP_NO;
 		else if (tmpmle->type == DLM_MLE_MIGRATION) {
 			mlog(0, "migration mle was found (%u->%u)\n",
 			     tmpmle->master, tmpmle->new_master);
-			if (tmpmle->master == dlm->node_num) {
-				mlog(ML_ERROR, "no lockres, but migration mle "
-				     "says that this node is master!\n");
-				BUG();
-			}
 			/* real master can respond on its own */
 			response = DLM_MASTER_RESP_NO;
-		} else {
-			if (tmpmle->master == dlm->node_num) {
-				response = DLM_MASTER_RESP_YES;
-				set_maybe = 0;
-			} else
-				response = DLM_MASTER_RESP_MAYBE;
-		}
+		} else
+			response = DLM_MASTER_RESP_MAYBE;
 		if (set_maybe)
 			set_bit(request->node_idx, tmpmle->maybe_map);
 		spin_unlock(&tmpmle->spinlock);
@@ -1449,6 +1506,24 @@ way_up_top:
 		dlm_put_mle(tmpmle);
 	}
 send_response:
+
+	if (dispatch_assert) {
+		if (response != DLM_MASTER_RESP_YES)
+			mlog(ML_ERROR, "invalid response %d\n", response);
+		if (!res) {
+			mlog(ML_ERROR, "bad lockres while trying to assert!\n");
+			BUG();
+		}
+		mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
+			     dlm->node_num, res->lockname.len, res->lockname.name);
+		ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 
+						 DLM_ASSERT_MASTER_MLE_CLEANUP);
+		if (ret < 0) {
+			mlog(ML_ERROR, "failed to dispatch assert master work\n");
+			response = DLM_MASTER_RESP_ERROR;
+		}
+	}
+
 	dlm_put(dlm);
 	return response;
 }
@@ -1471,8 +1546,11 @@ static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
 	int to, tmpret;
 	struct dlm_node_iter iter;
 	int ret = 0;
+	int reassert;
 
 	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
+again:
+	reassert = 0;
 
 	/* note that if this nodemap is empty, it returns 0 */
 	dlm_node_iter_init(nodemap, &iter);
@@ -1504,9 +1582,17 @@ static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
 			     "got %d.\n", namelen, lockname, to, r);
 			dlm_dump_lock_resources(dlm);
 			BUG();
+		} else if (r == EAGAIN) {
+			mlog(0, "%.*s: node %u create mles on other "
+			     "nodes and requests a re-assert\n", 
+			     namelen, lockname, to);
+			reassert = 1;
 		}
 	}
 
+	if (reassert)
+		goto again;
+
 	return ret;
 }
 
@@ -1528,6 +1614,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
 	char *name;
 	unsigned int namelen;
 	u32 flags;
+	int master_request = 0;
+	int ret = 0;
 
 	if (!dlm_grab(dlm))
 		return 0;
@@ -1642,11 +1730,22 @@ ok:
 	// mlog(0, "woo!  got an assert_master from node %u!\n",
 	// 	     assert->node_idx);
 	if (mle) {
-		int extra_ref;
+		int extra_ref = 0;
+		int nn = -1;
 		
 		spin_lock(&mle->spinlock);
-		extra_ref = !!(mle->type == DLM_MLE_BLOCK
-			       || mle->type == DLM_MLE_MIGRATION);
+		if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
+			extra_ref = 1;
+		else {
+			/* MASTER mle: if any bits set in the response map
+			 * then the calling node needs to re-assert to clear
+			 * up nodes that this node contacted */
+			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 
+						    nn+1)) < O2NM_MAX_NODES) {
+				if (nn != dlm->node_num && nn != assert->node_idx)
+					master_request = 1;
+			}
+		}
 		mle->master = assert->node_idx;
 		atomic_set(&mle->woken, 1);
 		wake_up(&mle->wq);
@@ -1677,10 +1776,15 @@ ok:
 	}
 
 done:
+	ret = 0;
 	if (res)
 		dlm_lockres_put(res);
 	dlm_put(dlm);
-	return 0;
+	if (master_request) {
+		mlog(0, "need to tell master to reassert\n");
+		ret = EAGAIN;  // positive. negative would shoot down the node.
+	}
+	return ret;
 
 kill:
 	/* kill the caller! */
@@ -1713,6 +1817,10 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
 	item->u.am.request_from = request_from;
 	item->u.am.flags = flags;
 
+	if (ignore_higher) 
+		mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 
+		     res->lockname.name);
+		
 	spin_lock(&dlm->work_lock);
 	list_add_tail(&item->list, &dlm->work_list);
 	spin_unlock(&dlm->work_lock);
@@ -1775,6 +1883,61 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
 	mlog(0, "finished with dlm_assert_master_worker\n");
 }
 
+/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
+ * We cannot wait for node recovery to complete to begin mastering this
+ * lockres because this lockres is used to kick off recovery! ;-)
+ * So, do a pre-check on all living nodes to see if any of those nodes
+ * think that $RECOVERY is currently mastered by a dead node.  If so, the
+ * caller waits a short time to let that node be notified by its own
+ * heartbeat stack, then checks again.  All $RECOVERY lock resources
+ * mastered by dead nodes are purged when the heartbeat callback is fired,
+ * so it is safe to continue once a live node or no node is reported.
+ * Returns 0 if mastery may proceed, -EAGAIN if the caller must wait. */
+static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
+				       struct dlm_lock_resource *res)
+{
+	struct dlm_node_iter iter;
+	int nodenum;
+	int ret = 0;
+	u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
+
+	spin_lock(&dlm->spinlock);
+	dlm_node_iter_init(dlm->domain_map, &iter);
+	spin_unlock(&dlm->spinlock);
+
+	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
+		/* do not send to self */
+		if (nodenum == dlm->node_num)
+			continue;
+		ret = dlm_do_master_requery(dlm, res, nodenum, &master);
+		if (ret < 0) {
+			mlog_errno(ret);
+			if (!dlm_is_host_down(ret))
+				BUG();
+			/* host is down, so answer for that node would be
+			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
+			ret = 0;
+		}
+		if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
+			/* check to see if this master is in the recovery map */
+			spin_lock(&dlm->spinlock);
+			if (test_bit(master, dlm->recovery_map)) {
+				mlog(ML_NOTICE, "%s: node %u has not seen "
+				     "node %u go down yet, and thinks the "
+				     "dead node is mastering the recovery "
+				     "lock.  must wait.\n", dlm->name,
+				     nodenum, master);
+				ret = -EAGAIN;
+			}
+			spin_unlock(&dlm->spinlock);
+			mlog(0, "%s: reco lock master is %u\n", dlm->name,
+			     master);
+			break;
+		}
+	}
+	return ret;
+}
+
 
 /*
  * DLM_MIGRATE_LOCKRES
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 1e232000f3f..805cbabac05 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -58,7 +58,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
 static int dlm_recovery_thread(void *data);
 void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
 int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
-static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
+void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
 static int dlm_do_recovery(struct dlm_ctxt *dlm);
 
 static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
@@ -78,15 +78,9 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
 				    u8 send_to,
 				    struct dlm_lock_resource *res,
 				    int total_locks);
-static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
-				      struct dlm_lock_resource *res,
-				      u8 *real_master);
 static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 				     struct dlm_lock_resource *res,
 				     struct dlm_migratable_lockres *mres);
-static int dlm_do_master_requery(struct dlm_ctxt *dlm,
-				 struct dlm_lock_resource *res,
-				 u8 nodenum, u8 *real_master);
 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
 static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
 				 u8 dead_node, u8 send_to);
@@ -165,7 +159,7 @@ void dlm_dispatch_work(void *data)
  * RECOVERY THREAD
  */
 
-static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
+void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
 {
 	/* wake the recovery thread
 	 * this will wake the reco thread in one of three places
@@ -750,10 +744,12 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 		     dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
 		     dead_node, reco_master);
 		mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
-		     "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
+		     "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
 		     dlm->name, mres->lockname_len, mres->lockname, mres->master,
 		     mres->num_locks, mres->total_locks, mres->flags,
-		     mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags,
+		     dlm_get_lock_cookie_node(mres->ml[0].cookie),
+		     dlm_get_lock_cookie_seq(mres->ml[0].cookie),
+		     mres->ml[0].list, mres->ml[0].flags,
 		     mres->ml[0].type, mres->ml[0].convert_type,
 		     mres->ml[0].highest_blocked, mres->ml[0].node);
 		BUG();
@@ -1316,9 +1312,8 @@ leave:
 
 
 
-static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
-				      struct dlm_lock_resource *res,
-				      u8 *real_master)
+int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+			       struct dlm_lock_resource *res, u8 *real_master)
 {
 	struct dlm_node_iter iter;
 	int nodenum;
@@ -1360,8 +1355,10 @@ static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
 		ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
 		if (ret < 0) {
 			mlog_errno(ret);
-			BUG();
-			/* TODO: need to figure a way to restart this */
+			if (!dlm_is_host_down(ret))
+				BUG();
+			/* host is down, so answer for that node would be
+			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
 		}
 		if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
 			mlog(0, "lock master is %u\n", *real_master);
@@ -1372,9 +1369,8 @@ static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
 }
 
 
-static int dlm_do_master_requery(struct dlm_ctxt *dlm,
-				 struct dlm_lock_resource *res,
-				 u8 nodenum, u8 *real_master)
+int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
+			  u8 nodenum, u8 *real_master)
 {
 	int ret = -EINVAL;
 	struct dlm_master_requery req;
@@ -1519,9 +1515,11 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			/* lock is always created locally first, and
 			 * destroyed locally last.  it must be on the list */
 			if (!lock) {
+				u64 c = ml->cookie;
 				mlog(ML_ERROR, "could not find local lock "
-					       "with cookie %"MLFu64"!\n",
-				     ml->cookie);
+					       "with cookie %u:%llu!\n",
+					       dlm_get_lock_cookie_node(c),
+					       dlm_get_lock_cookie_seq(c));
 				BUG();
 			}
 			BUG_ON(lock->ml.node != ml->node);
@@ -1739,6 +1737,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 				} else
 					continue;
 
+				if (!list_empty(&res->recovering)) {
+					mlog(0, "%s:%.*s: lockres was "
+					     "marked RECOVERING, owner=%u\n",
+					     dlm->name, res->lockname.len,
+					     res->lockname.name, res->owner);
+					list_del_init(&res->recovering);
+				}
 				spin_lock(&res->spinlock);
 				dlm_change_lockres_owner(dlm, res, new_master);
 				res->state &= ~DLM_LOCK_RES_RECOVERING;
@@ -2258,7 +2263,10 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 			mlog(0, "%u not in domain/live_nodes map "
 			     "so setting it in reco map manually\n",
 			     br->dead_node);
-		set_bit(br->dead_node, dlm->recovery_map);
+		/* force the recovery cleanup in __dlm_hb_node_down
+		 * both of these will be cleared in a moment */
+		set_bit(br->dead_node, dlm->domain_map);
+		set_bit(br->dead_node, dlm->live_nodes_map);
 		__dlm_hb_node_down(dlm, br->dead_node);
 	}
 	spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index c95f08d2e92..7b1a2754267 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -244,8 +244,10 @@ leave:
 	if (actions & DLM_UNLOCK_FREE_LOCK) {
 		/* this should always be coupled with list removal */
 		BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
-		mlog(0, "lock %"MLFu64" should be gone now! refs=%d\n",
-		     lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1);
+		mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
+		     dlm_get_lock_cookie_node(lock->ml.cookie),
+		     dlm_get_lock_cookie_seq(lock->ml.cookie),
+		     atomic_read(&lock->lock_refs.refcount)-1);
 		dlm_lock_put(lock);
 	}
 	if (actions & DLM_UNLOCK_CALL_AST)
@@ -493,8 +495,9 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
 not_found:
 	if (!found)
 		mlog(ML_ERROR, "failed to find lock to unlock! "
-			       "cookie=%"MLFu64"\n",
-		     unlock->cookie);
+			       "cookie=%u:%llu\n",
+			       dlm_get_lock_cookie_node(unlock->cookie),
+			       dlm_get_lock_cookie_seq(unlock->cookie));
 	else {
 		/* send the lksb->status back to the other node */
 		status = lksb->status;