Diffstat (limited to 'fs')
54 files changed, 2175 insertions, 2019 deletions
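A large share of the hunks below (afs, autofs4, configfs, dcache, jffs2, nfsd, ocfs2) are the same mechanical refactor: an open-coded `list_del()` followed by `list_add()`/`list_add_tail()` is collapsed into `list_move()`/`list_move_tail()`, and `list_add(entry, head->prev)` is rewritten as the clearer `list_add_tail(entry, head)`. The sketch below is a minimal user-space re-implementation of those helpers, only to show that the before/after forms are equivalent; the struct and function names mirror <linux/list.h> but this is an illustrative stand-in, not the kernel code itself.

	#include <stdio.h>

	/* minimal circular doubly-linked list, modelled on <linux/list.h> */
	struct list_head {
		struct list_head *next, *prev;
	};

	static void INIT_LIST_HEAD(struct list_head *h)
	{
		h->next = h->prev = h;
	}

	static void __list_add(struct list_head *new,
			       struct list_head *prev, struct list_head *next)
	{
		next->prev = new;
		new->next = next;
		new->prev = prev;
		prev->next = new;
	}

	static void list_add(struct list_head *new, struct list_head *head)
	{
		__list_add(new, head, head->next);	/* insert at head */
	}

	static void list_add_tail(struct list_head *new, struct list_head *head)
	{
		__list_add(new, head->prev, head);	/* insert at tail */
	}

	/* what the "list_del() + list_add_tail()" pairs in this patch collapse to */
	static void list_move_tail(struct list_head *entry, struct list_head *head)
	{
		/* unlink from the old list ... */
		entry->prev->next = entry->next;
		entry->next->prev = entry->prev;
		/* ... and re-link at the tail of the new one */
		list_add_tail(entry, head);
	}

	int main(void)
	{
		struct list_head a_list, b_list, node;

		INIT_LIST_HEAD(&a_list);
		INIT_LIST_HEAD(&b_list);
		list_add(&node, &a_list);

		/* old style: list_del(&node); list_add_tail(&node, &b_list); */
		list_move_tail(&node, &b_list);

		printf("node moved to b_list: %s\n",
		       (b_list.next == &node && node.prev == &b_list) ? "yes" : "no");
		return 0;
	}

Note also that `list_add(entry, head->prev)` (as previously used in fs/coda, fs/dquot.c and fs/dcache.c) appends before `head`, which is exactly `list_add_tail(entry, head)`; the conversions change readability, not behaviour.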
diff --git a/fs/9p/mux.c b/fs/9p/mux.c index f4407eb276c..12e1baa4508 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -712,7 +712,7 @@ static void v9fs_read_work(void *a) * v9fs_send_request - send 9P request * The function can sleep until the request is scheduled for sending. * The function can be interrupted. Return from the function is not - * a guarantee that the request is sent succesfully. Can return errors + * a guarantee that the request is sent successfully. Can return errors * that can be retrieved by PTR_ERR macros. * * @m: mux data diff --git a/fs/Kconfig b/fs/Kconfig index 1cdc043922d..6c5051802bd 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1490,7 +1490,12 @@ config NFSD select LOCKD select SUNRPC select EXPORTFS - select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL + select NFSD_V2_ACL if NFSD_V3_ACL + select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFSD_TCP if NFSD_V4 + select CRYPTO_MD5 if NFSD_V4 + select CRYPTO if NFSD_V4 + select FS_POSIX_ACL if NFSD_V4 help If you want your Linux box to act as an NFS *server*, so that other computers on your local network which support NFS can access certain @@ -1528,7 +1533,6 @@ config NFSD_V3 config NFSD_V3_ACL bool "Provide server support for the NFSv3 ACL protocol extension" depends on NFSD_V3 - select NFSD_V2_ACL help Implement the NFSv3 ACL protocol extension for manipulating POSIX Access Control Lists on exported file systems. NFS clients should @@ -1538,10 +1542,6 @@ config NFSD_V3_ACL config NFSD_V4 bool "Provide NFSv4 server support (EXPERIMENTAL)" depends on NFSD_V3 && EXPERIMENTAL - select NFSD_TCP - select CRYPTO_MD5 - select CRYPTO - select FS_POSIX_ACL help If you would like to include the NFSv4 server as well as the NFSv2 and NFSv3 servers, say Y here. This feature is experimental, and diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 009a9ae88d6..bfc1fd22d5b 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer, /* we found it in the graveyard - resurrect it */ found_dead_server: - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_list); + list_move_tail(&server->link, &cell->sv_list); afs_get_server(server); afs_kafstimod_del_timer(&server->timeout); spin_unlock(&cell->sv_gylock); diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c index 7ac07d0d47b..f09a794f248 100644 --- a/fs/afs/kafsasyncd.c +++ b/fs/afs/kafsasyncd.c @@ -136,8 +136,7 @@ static int kafsasyncd(void *arg) if (!list_empty(&kafsasyncd_async_attnq)) { op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link); - list_del(&op->link); - list_add_tail(&op->link, + list_move_tail(&op->link, &kafsasyncd_async_busyq); } @@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op) init_waitqueue_entry(&op->waiter, kafsasyncd_task); add_wait_queue(&op->call->waitq, &op->waiter); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_busyq); + list_move_tail(&op->link, &kafsasyncd_async_busyq); spin_unlock(&kafsasyncd_async_lock); @@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op) spin_lock(&kafsasyncd_async_lock); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_attnq); + list_move_tail(&op->link, &kafsasyncd_async_attnq); spin_unlock(&kafsasyncd_async_lock); diff --git a/fs/afs/server.c b/fs/afs/server.c index 62b093aa41c..22afaae1a4c 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr, resurrect_server: 
_debug("resurrecting server"); - list_del(&zombie->link); - list_add_tail(&zombie->link, &cell->sv_list); + list_move_tail(&zombie->link, &cell->sv_list); afs_get_server(zombie); afs_kafstimod_del_timer(&zombie->timeout); spin_unlock(&cell->sv_gylock); @@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server) } spin_lock(&cell->sv_gylock); - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_graveyard); + list_move_tail(&server->link, &cell->sv_graveyard); /* time out in 10 secs */ afs_kafstimod_add_timer(&server->timeout, 10 * HZ); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index eced20618ec..331f730a1fb 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell, /* found in the graveyard - resurrect */ _debug("found in graveyard"); atomic_inc(&vlocation->usage); - list_del(&vlocation->link); - list_add_tail(&vlocation->link, &cell->vl_list); + list_move_tail(&vlocation->link, &cell->vl_list); spin_unlock(&cell->vl_gylock); afs_kafstimod_del_timer(&vlocation->timeout); @@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation) } /* move to graveyard queue */ - list_del(&vlocation->link); - list_add_tail(&vlocation->link,&cell->vl_graveyard); + list_move_tail(&vlocation->link,&cell->vl_graveyard); /* remove from pending timeout queue (refcounted if actually being * updated) */ diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c index 9867fef3261..cf62da5d782 100644 --- a/fs/afs/vnode.c +++ b/fs/afs/vnode.c @@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode, vnode->cb_expiry * HZ); spin_lock(&afs_cb_hash_lock); - list_del(&vnode->cb_hash_link); - list_add_tail(&vnode->cb_hash_link, + list_move_tail(&vnode->cb_hash_link, &afs_cb_hash(server, &vnode->fid)); spin_unlock(&afs_cb_hash_lock); @@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb) * invoked both for initial i/o submission and * subsequent retries via the aio_kick_handler. * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reaquired + * already held. The lock is released and reacquired * as needed during processing. 
* * Calls the iocb retry method (already setup for the diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 4456d1daa40..8dbd44f10e9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -376,8 +376,7 @@ next: DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); spin_lock(&dcache_lock); - list_del(&expired->d_parent->d_subdirs); - list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b1c902e319c..c94d52eafd1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -510,7 +510,7 @@ static int load_flat_file(struct linux_binprm * bprm, } /* OK, This is the point of no return */ - set_personality(PER_LINUX); + set_personality(PER_LINUX_32BIT); } /* diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 6c6771db36d..7caee8d8ea3 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -259,7 +259,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, /* If request was not a signal, enqueue and don't free */ if (!(req->uc_flags & REQ_ASYNC)) { req->uc_flags |= REQ_READ; - list_add(&(req->uc_chain), vcp->vc_processing.prev); + list_add_tail(&(req->uc_chain), &vcp->vc_processing); goto out; } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index b040eba13a7..a5b5e631ba6 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi, ((union inputArgs *)buffer)->ih.unique = req->uc_unique; /* Append msg to pending queue and poke Venus. */ - list_add(&(req->uc_chain), vcommp->vc_pending.prev); + list_add_tail(&(req->uc_chain), &vcommp->vc_pending); wake_up_interruptible(&vcommp->vc_waitq); /* We can be interrupted while we wait for Venus to process diff --git a/fs/compat.c b/fs/compat.c index 7e7e5bc4f3c..e31e9cf9664 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -55,6 +55,20 @@ extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); +int compat_log = 1; + +int compat_printk(const char *fmt, ...) +{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = vprintk(fmt, ap); + va_end(ap); + return ret; +} + /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. 
@@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd, sprintf(buf,"'%c'", (cmd>>24) & 0x3f); if (!isprint(buf[1])) sprintf(buf, "%02x", buf[1]); - printk("ioctl32(%s:%d): Unknown cmd fd(%d) " + compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " "cmd(%08x){%s} arg(%08x) on %s\n", current->comm, current->pid, (int)fd, (unsigned int)cmd, buf, diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 9eb9824dd33..d8ecfedef18 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -80,6 +80,7 @@ #include <net/bluetooth/rfcomm.h> #include <linux/capi.h> +#include <linux/gigaset_dev.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5f952187fc5..207f8006fd6 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir /* fallthrough */ default: if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); + list_move(q, &parent_sd->s_children); } for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct configfs_dirent *next; @@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/dcache.c b/fs/dcache.c index b85fda36053..48b44a714b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb) dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; - list_del(tmp); - list_add(tmp, &dentry_unused); + list_move(tmp, &dentry_unused); } /* @@ -638,7 +637,7 @@ resume: * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add(&dentry->d_lru, dentry_unused.prev); + list_add_tail(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; found++; } diff --git a/fs/dquot.c b/fs/dquot.c index 81d87a413c6..0122a279106 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block /* Add a dquot to the tail of the free list */ static inline void put_dquot_last(struct dquot *dquot) { - list_add(&dquot->dq_free, free_dquots.prev); + list_add_tail(&dquot->dq_free, &free_dquots); dqstats.free_dquots++; } @@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot) { /* We add to the back of inuse list so we don't have to restart * when traversing this list and we block */ - list_add(&dquot->dq_inuse, inuse_list.prev); + list_add_tail(&dquot->dq_inuse, &inuse_list); dqstats.allocated_dquots++; } diff --git a/fs/exec.c b/fs/exec.c index 0b88bf64614..c8494f513ea 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct dentry *proc_dentry1, *proc_dentry2; - /* * Wait for the thread group leader to be a zombie. 
* It should already be zombie at this point, most @@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk) */ current->start_time = leader->start_time; - spin_lock(&leader->proc_lock); - spin_lock(¤t->proc_lock); - proc_dentry1 = proc_pid_unhash(current); - proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); BUG_ON(leader->tgid != current->tgid); @@ -713,7 +707,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail_rcu(¤t->tasks, &init_task.tasks); + list_replace_rcu(&leader->tasks, ¤t->tasks); current->group_leader = current; leader->group_leader = current; @@ -721,7 +715,6 @@ static int de_thread(struct task_struct *tsk) /* Reduce leader to a thread */ detach_pid(leader, PIDTYPE_PGID); detach_pid(leader, PIDTYPE_SID); - list_del_init(&leader->tasks); current->exit_signal = SIGCHLD; @@ -729,10 +722,6 @@ static int de_thread(struct task_struct *tsk) leader->exit_state = EXIT_DEAD; write_unlock_irq(&tasklist_lock); - spin_unlock(&leader->proc_lock); - spin_unlock(¤t->proc_lock); - proc_pid_flush(proc_dentry1); - proc_pid_flush(proc_dentry2); } /* @@ -1379,67 +1368,102 @@ static void format_corename(char *corename, const char *pattern, long signr) *out_ptr = 0; } -static void zap_threads (struct mm_struct *mm) +static void zap_process(struct task_struct *start) { - struct task_struct *g, *p; - struct task_struct *tsk = current; - struct completion *vfork_done = tsk->vfork_done; - int traced = 0; + struct task_struct *t; - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_stop_count = 0; - read_lock(&tasklist_lock); - do_each_thread(g,p) - if (mm == p->mm && p != tsk) { - force_sig_specific(SIGKILL, p); - mm->core_waiters++; - if (unlikely(p->ptrace) && - unlikely(p->parent->mm == mm)) - traced = 1; + t = start; + do { + if (t != current && t->mm) { + t->mm->core_waiters++; + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); } - while_each_thread(g,p); + } while ((t = next_thread(t)) != start); +} - read_unlock(&tasklist_lock); +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + int exit_code) +{ + struct task_struct *g, *p; + unsigned long flags; + int err = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + tsk->signal->group_exit_code = exit_code; + zap_process(tsk); + err = 0; + } + spin_unlock_irq(&tsk->sighand->siglock); + if (err) + return err; - if (unlikely(traced)) { - /* - * We are zapping a thread and the thread it ptraces. - * If the tracee went into a ptrace stop for exit tracing, - * we could deadlock since the tracer is waiting for this - * coredump to finish. Detach them so they can both die. 
- */ - write_lock_irq(&tasklist_lock); - do_each_thread(g,p) { - if (mm == p->mm && p != tsk && - p->ptrace && p->parent->mm == mm) { - __ptrace_detach(p, 0); + if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + goto done; + + rcu_read_lock(); + for_each_process(g) { + if (g == tsk->group_leader) + continue; + + p = g; + do { + if (p->mm) { + if (p->mm == mm) { + /* + * p->sighand can't disappear, but + * may be changed by de_thread() + */ + lock_task_sighand(p, &flags); + zap_process(p); + unlock_task_sighand(p, &flags); + } + break; } - } while_each_thread(g,p); - write_unlock_irq(&tasklist_lock); + } while ((p = next_thread(p)) != g); } + rcu_read_unlock(); +done: + return mm->core_waiters; } -static void coredump_wait(struct mm_struct *mm) +static int coredump_wait(int exit_code) { - DECLARE_COMPLETION(startup_done); + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + struct completion startup_done; + struct completion *vfork_done; int core_waiters; + init_completion(&mm->core_done); + init_completion(&startup_done); mm->core_startup_done = &startup_done; - zap_threads(mm); - core_waiters = mm->core_waiters; + core_waiters = zap_threads(tsk, mm, exit_code); up_write(&mm->mmap_sem); + if (unlikely(core_waiters < 0)) + goto fail; + + /* + * Make sure nobody is waiting for us to release the VM, + * otherwise we can deadlock when we wait on each other + */ + vfork_done = tsk->vfork_done; + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + if (core_waiters) wait_for_completion(&startup_done); +fail: BUG_ON(mm->core_waiters); + return core_waiters; } int do_coredump(long signr, int exit_code, struct pt_regs * regs) @@ -1473,22 +1497,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) } mm->dumpable = 0; - retval = -EAGAIN; - spin_lock_irq(¤t->sighand->siglock); - if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { - current->signal->flags = SIGNAL_GROUP_EXIT; - current->signal->group_exit_code = exit_code; - current->signal->group_stop_count = 0; - retval = 0; - } - spin_unlock_irq(¤t->sighand->siglock); - if (retval) { - up_write(&mm->mmap_sem); + retval = coredump_wait(exit_code); + if (retval < 0) goto fail; - } - - init_completion(&mm->core_done); - coredump_wait(mm); /* * Clear any false indication of pending signals that might diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b2891cc29db..b7483360a2d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -630,7 +630,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -666,6 +666,7 @@ static match_table_t tokens = { {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, @@ -1014,6 +1015,9 @@ clear_qf_name: case Opt_nobh: set_opt(sbi->s_mount_opt, NOBH); break; + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 
1862e8bc101..b8886f048ea 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -53,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, if (!instr) { printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -86,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, /* Erase failed immediately. Refile it on the list */ D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -161,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo { D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add_tail(&jeb->list, &c->erase_complete_list); + list_move_tail(&jeb->list, &c->erase_complete_list); spin_unlock(&c->erase_completion_lock); /* Ensure that kupdated calls us again to mark them clean */ jffs2_erase_pending_trigger(c); @@ -178,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { /* We'd like to give this block another try. */ spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -191,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock spin_lock(&c->erase_completion_lock); c->erasing_size -= c->sector_size; c->bad_size += c->sector_size; - list_del(&jeb->list); - list_add(&jeb->list, &c->bad_list); + list_move(&jeb->list, &c->bad_list); c->nr_erasing_blocks--; spin_unlock(&c->erase_completion_lock); wake_up(&c->erase_wait); diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 8bedfd2ff68..ac0c350ed7d 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -211,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) struct jffs2_eraseblock *ejeb; ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); - list_del(&ejeb->list); - list_add_tail(&ejeb->list, &c->erase_pending_list); + list_move_tail(&ejeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index 0b02fc79e4d..be1acc3dad9 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c) return -ENOMEM; } - dbg_summary("returned succesfully\n"); + dbg_summary("returned successfully\n"); return 0; } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index a7f153f79ec..b9b700730df 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -495,8 +495,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) /* Fix up the original jeb now it's on the bad_list 
*/ if (first_raw == jeb->first_node) { D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); } diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 5549378358b..4d52593a5fc 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr) /* allocate the disk blocks for the extent. initially, extBalloc() * will try to allocate disk blocks for the requested size (xlen). - * if this fails (xlen contigious free blocks not avaliable), it'll + * if this fails (xlen contiguous free blocks not avaliable), it'll * try to allocate a smaller number of blocks (producing a smaller * extent), with this smaller number of blocks consisting of the * requested number of blocks rounded down to the next smaller @@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp) * * initially, we will try to allocate disk blocks for the * requested size (nblocks). if this fails (nblocks - * contigious free blocks not avaliable), we'll try to allocate + * contiguous free blocks not avaliable), we'll try to allocate * a smaller number of blocks (producing a smaller extent), with * this smaller number of blocks consisting of the requested * number of blocks rounded down to the next smaller power of 2 @@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) /* get the number of blocks to initially attempt to allocate. * we'll first try the number of blocks requested unless this - * number is greater than the maximum number of contigious free + * number is greater than the maximum number of contiguous free * blocks in the map. in that case, we'll start off with the * maximum free. */ @@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * in place. if this fails, we'll try to move the extent * to a new set of blocks. if moving the extent, we initially * will try to allocate disk blocks for the requested size - * (nnew). if this fails (nnew contigious free blocks not + * (nnew). 
if this fails (new contiguous free blocks not * avaliable), we'll try to allocate a smaller number of * blocks (producing a smaller extent), with this smaller * number of blocks consisting of the requested number of diff --git a/fs/libfs.c b/fs/libfs.c index fc785d8befb..ac02ea602c3 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) /* fallthrough */ default: spin_lock(&dcache_lock); - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &dentry->d_subdirs); - } + if (filp->f_pos == 2) + list_move(q, &dentry->d_subdirs); + for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); @@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; spin_lock(&dcache_lock); /* next is still alive */ - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/namespace.c b/fs/namespace.c index 866430bb024..b3ed212ea41 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { struct vfsmount *p; - for (p = mnt; p; p = next_mnt(p, mnt)) { - list_del(&p->mnt_hash); - list_add(&p->mnt_hash, kill); - } + for (p = mnt; p; p = next_mnt(p, mnt)) + list_move(&p->mnt_hash, kill); if (propagate) propagate_umount(kill); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 96c7578cbe1..1630b5670dc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp) dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_del_init(&clp->cl_strhash); - list_del_init(&clp->cl_idhash); - list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); strhashval = clientstr_hashval(clp->cl_recdir); list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); renew_client(clp); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index d852ebb538e..fdf7cf3dfad 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -103,8 +103,7 @@ nfsd_cache_shutdown(void) static void lru_put_end(struct svc_cacherep *rp) { - list_del(&rp->c_lru); - list_add_tail(&rp->c_lru, &lru_head); + list_move_tail(&rp->c_lru, &lru_head); } /* diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 355593dd8ef..42775e2bbe2 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, lock->ml.node == dlm->node_num ? "master" : "remote"); memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); - } else if (lksb->flags & DLM_LKSB_PUT_LVB) { - mlog(0, "setting lvb from lockres for %s node\n", - lock->ml.node == dlm->node_num ? "master" : - "remote"); - memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); } + /* Do nothing for lvb put requests - they should be done in + * place when the lock is downconverted - otherwise we risk + * racing gets and puts which could result in old lvb data + * being propagated. We leave the put flag set and clear it + * here. In the future we might want to clear it at the time + * the put is actually done. + */ spin_unlock(&res->spinlock); } @@ -381,8 +383,7 @@ do_ast: ret = DLM_NORMAL; if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. 
*/ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); mlog(0, "ast: adding to granted list... type=%d, " "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); if (lock->ml.convert_type != LKM_IVMODE) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 88cc43df18f..9bdc9cf6599 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,7 +37,17 @@ #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_SIZE_DEFAULT (1 << 14) +#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE +# define DLM_HASH_PAGES 1 +#else +# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE) +#endif +#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) + +/* Intended to make it easier for us to switch out hash functions */ +#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) enum dlm_ast_type { DLM_AST = 0, @@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) return 0; } -#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_FINALIZE 0x0002 struct dlm_recovery_ctxt { @@ -85,7 +96,7 @@ enum dlm_ctxt_state { struct dlm_ctxt { struct list_head list; - struct hlist_head *lockres_hash; + struct hlist_head **lockres_hash; struct list_head dirty_list; struct list_head purge_list; struct list_head pending_asts; @@ -120,6 +131,7 @@ struct dlm_ctxt struct o2hb_callback_func dlm_hb_down; struct task_struct *dlm_thread_task; struct task_struct *dlm_reco_thread_task; + struct workqueue_struct *dlm_worker; wait_queue_head_t dlm_thread_wq; wait_queue_head_t dlm_reco_thread_wq; wait_queue_head_t ast_wq; @@ -132,6 +144,11 @@ struct dlm_ctxt struct list_head dlm_eviction_callbacks; }; +static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i) +{ + return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); +} + /* these keventd work queue items are for less-frequently * called functions that cannot be directly called from the * net message handlers for some reason, usually because @@ -216,20 +233,29 @@ struct dlm_lock_resource /* WARNING: Please see the comment in dlm_init_lockres before * adding fields here. */ struct hlist_node hash_node; + struct qstr lockname; struct kref refs; - /* please keep these next 3 in this order - * some funcs want to iterate over all lists */ + /* + * Please keep granted, converting, and blocked in this order, + * as some funcs want to iterate over all lists. + * + * All four lists are protected by the hash's reference. + */ struct list_head granted; struct list_head converting; struct list_head blocked; + struct list_head purge; + /* + * These two lists require you to hold an additional reference + * while they are on the list. + */ struct list_head dirty; struct list_head recovering; // dlm_recovery_ctxt.resources list /* unused lock resources have their last_used stamped and are * put on a list for the dlm thread to run. 
*/ - struct list_head purge; unsigned long last_used; unsigned migration_pending:1; @@ -238,7 +264,6 @@ struct dlm_lock_resource wait_queue_head_t wq; u8 owner; //node which owns the lock resource, or unknown u16 state; - struct qstr lockname; char lvb[DLM_LVB_LEN]; }; @@ -300,6 +325,15 @@ enum dlm_lockres_list { DLM_BLOCKED_LIST }; +static inline int dlm_lvb_is_empty(char *lvb) +{ + int i; + for (i=0; i<DLM_LVB_LEN; i++) + if (lvb[i]) + return 0; + return 1; +} + static inline struct list_head * dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) { @@ -609,7 +643,8 @@ struct dlm_finalize_reco { u8 node_idx; u8 dead_node; - __be16 pad1; + u8 flags; + u8 pad1; __be32 pad2; }; @@ -676,6 +711,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm); void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); @@ -687,14 +723,20 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres); -void dlm_lockres_get(struct dlm_lock_resource *res); +static inline void dlm_lockres_get(struct dlm_lock_resource *res) +{ + /* This is called on every lookup, so it might be worth + * inlining. */ + kref_get(&res->refs); +} void dlm_lockres_put(struct dlm_lock_resource *res); void __dlm_unhash_lockres(struct dlm_lock_resource *res); void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, - unsigned int len); + unsigned int len, + unsigned int hash); struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len); @@ -819,6 +861,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); +int __dlm_lockres_unused(struct dlm_lock_resource *res); static inline const char * dlm_lock_mode_name(int mode) { diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 8285228d9e3..c764dc8e40a 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -214,6 +214,9 @@ grant: if (lock->ml.node == dlm->node_num) mlog(0, "doing in-place convert for nonlocal lock\n"); lock->ml.type = type; + if (lock->lksb->flags & DLM_LKSB_PUT_LVB) + memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + status = DLM_NORMAL; *call_ast = 1; goto unlock_exit; @@ -231,8 +234,7 @@ switch_queues: lock->ml.convert_type = type; /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); unlock_exit: spin_unlock(&lock->spinlock); @@ -248,8 +250,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res, struct dlm_lock *lock) { /* do not alter lock refcount. switching lists. 
*/ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); } @@ -294,8 +295,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, res->state |= DLM_LOCK_RES_IN_PROGRESS; /* move lock to local convert queue */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); lock->convert_pending = 1; lock->ml.convert_type = type; @@ -464,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL) { + spin_unlock(&res->spinlock); + dlm_error(status); + goto leave; + } list_for_each(iter, &res->granted) { lock = list_entry(iter, struct dlm_lock, list); if (lock->ml.cookie == cnv->cookie && @@ -473,6 +479,21 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } lock = NULL; } + if (!lock) { + __dlm_print_one_lock_resource(res); + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.node == cnv->node_idx) { + mlog(ML_ERROR, "There is something here " + "for node %u, lock->ml.cookie=%llu, " + "cnv->cookie=%llu\n", cnv->node_idx, + (unsigned long long)lock->ml.cookie, + (unsigned long long)cnv->cookie); + break; + } + } + lock = NULL; + } spin_unlock(&res->spinlock); if (!lock) { status = DLM_IVLOCKID; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c7eae5d3324..3f6c8d88f7a 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -37,10 +37,8 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" -#include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" @@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid) } EXPORT_SYMBOL_GPL(dlm_print_one_lock); +#if 0 void dlm_dump_lock_resources(struct dlm_ctxt *dlm) { struct dlm_lock_resource *res; @@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm) spin_lock(&dlm->spinlock); for (i=0; i<DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) dlm_print_one_lock_resource(res); } spin_unlock(&dlm->spinlock); } +#endif /* 0 */ static const char *dlm_errnames[] = { [DLM_NORMAL] = "DLM_NORMAL", diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h deleted file mode 100644 index 6858510c3cc..00000000000 --- a/fs/ocfs2/dlm/dlmdebug.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmdebug.h - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef DLMDEBUG_H -#define DLMDEBUG_H - -void dlm_dump_lock_resources(struct dlm_ctxt *dlm); - -#endif diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8f3a9e3106f..ba27c5c5e95 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -41,7 +41,6 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #include "dlmver.h" @@ -49,6 +48,33 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" +static void dlm_free_pagevec(void **vec, int pages) +{ + while (pages--) + free_page((unsigned long)vec[pages]); + kfree(vec); +} + +static void **dlm_alloc_pagevec(int pages) +{ + void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + int i; + + if (!vec) + return NULL; + + for (i = 0; i < pages; i++) + if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL))) + goto out_free; + + mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n", + pages, DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE); + return vec; +out_free: + dlm_free_pagevec(vec, i); + return NULL; +} + /* * * spinlock lock ordering: if multiple locks are needed, obey this ordering: @@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, assert_spin_locked(&dlm->spinlock); q = &res->lockname; - q->hash = full_name_hash(q->name, q->len); - bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]); + bucket = dlm_lockres_hash(dlm, q->hash); /* get a reference for our hashtable */ dlm_lockres_get(res); @@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, } struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len) + const char *name, + unsigned int len, + unsigned int hash) { - unsigned int hash; - struct hlist_node *iter; - struct dlm_lock_resource *tmpres=NULL; struct hlist_head *bucket; + struct hlist_node *list; mlog_entry("%.*s\n", len, name); assert_spin_locked(&dlm->spinlock); - hash = full_name_hash(name, len); - - bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]); - - /* check for pre-existing lock */ - hlist_for_each(iter, bucket) { - tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node); - if (tmpres->lockname.len == len && - memcmp(tmpres->lockname.name, name, len) == 0) { - dlm_lockres_get(tmpres); - break; - } + bucket = dlm_lockres_hash(dlm, hash); - tmpres = NULL; + hlist_for_each(list, bucket) { + struct dlm_lock_resource *res = hlist_entry(list, + struct dlm_lock_resource, hash_node); + if (res->lockname.name[0] != name[0]) + continue; + if (unlikely(res->lockname.len != len)) + continue; + if (memcmp(res->lockname.name + 1, name + 1, len - 1)) + continue; + dlm_lockres_get(res); + return res; } - return tmpres; + return NULL; } struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, @@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, unsigned int len) { struct dlm_lock_resource *res; + unsigned int hash = dlm_lockid_hash(name, len); spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, len); + res = __dlm_lookup_lockres(dlm, name, len, hash); spin_unlock(&dlm->spinlock); return res; } @@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(const char *domain) static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) 
{ if (dlm->lockres_hash) - free_page((unsigned long) dlm->lockres_hash); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); if (dlm->name) kfree(dlm->name); @@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm) return ret; } +static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_worker) { + flush_workqueue(dlm->dlm_worker); + destroy_workqueue(dlm->dlm_worker); + dlm->dlm_worker = NULL; + } +} + static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); /* We've left the domain. Now we can take ourselves out of the * list and allow the kref stuff to help us free the @@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) restart: spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(&dlm->lockres_hash[i])) { - res = hlist_entry(dlm->lockres_hash[i].first, + while (!hlist_empty(dlm_lockres_hash(dlm, i))) { + res = hlist_entry(dlm_lockres_hash(dlm, i)->first, struct dlm_lock_resource, hash_node); /* need reference when manually grabbing lockres */ dlm_lockres_get(res); @@ -1126,6 +1160,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } + dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + if (!dlm->dlm_worker) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + do { unsigned int backoff; status = dlm_try_to_join_domain(dlm); @@ -1166,6 +1207,7 @@ bail: dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); } return status; @@ -1191,7 +1233,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); + dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->lockres_hash) { mlog_errno(-ENOMEM); kfree(dlm->name); @@ -1200,8 +1242,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - for (i=0; i<DLM_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dlm->lockres_hash[i]); + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); strcpy(dlm->name, domain); dlm->key = key; @@ -1231,6 +1273,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; + dlm->dlm_worker = NULL; init_waitqueue_head(&dlm->dlm_thread_wq); init_waitqueue_head(&dlm->dlm_reco_thread_wq); init_waitqueue_head(&dlm->reco.event); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 7273d9fa6ba..033ad170123 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode, * doesn't make sense for LVB writes. 
*/ file->f_flags &= ~O_APPEND; - fp = kmalloc(sizeof(*fp), GFP_KERNEL); + fp = kmalloc(sizeof(*fp), GFP_NOFS); if (!fp) { status = -ENOMEM; goto bail; @@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp, else readlen = count - *ppos; - lvb_buf = kmalloc(readlen, GFP_KERNEL); + lvb_buf = kmalloc(readlen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; @@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp, else writelen = count - *ppos; - lvb_buf = kmalloc(writelen, GFP_KERNEL); + lvb_buf = kmalloc(writelen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 6fea28318d6..d6f89577e25 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags) { enum dlm_status status = DLM_DENIED; + int lockres_changed = 1; mlog_entry("type=%d\n", lock->ml.type); mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, @@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, res->state &= ~DLM_LOCK_RES_IN_PROGRESS; lock->lock_pending = 0; if (status != DLM_NORMAL) { - if (status != DLM_NOTQUEUED) + if (status == DLM_RECOVERING && + dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* recovery lock was mastered by dead node. + * we need to have calc_usage shoot down this + * lockres and completely remaster it. */ + mlog(0, "%s: recovery lock was owned by " + "dead node %u, remaster it now.\n", + dlm->name, res->owner); + } else if (status != DLM_NOTQUEUED) { + /* + * DO NOT call calc_usage, as this would unhash + * the remote lockres before we ever get to use + * it. treat as if we never made any change to + * the lockres. + */ + lockres_changed = 0; dlm_error(status); + } dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); } else if (dlm_is_recovery_lock(res->lockname.name, @@ -239,12 +257,12 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, mlog(0, "%s: $RECOVERY lock for this node (%u) is " "mastered by %u; got lock, manually granting (no ast)\n", dlm->name, dlm->node_num, res->owner); - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); } spin_unlock(&res->spinlock); - dlm_lockres_calc_usage(dlm, res); + if (lockres_changed) + dlm_lockres_calc_usage(dlm, res); wake_up(&res->wq); return status; @@ -281,6 +299,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, if (tmpret >= 0) { // successfully sent and received ret = status; // this is already a dlm_status + if (ret == DLM_REJECTED) { + mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " + "no longer owned by %u. 
that node is coming back " + "up currently.\n", dlm->name, create.namelen, + create.name, res->owner); + dlm_print_one_lock_resource(res); + BUG(); + } } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { @@ -382,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, struct dlm_lock *lock; int kernel_allocated = 0; - lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); + lock = kcalloc(1, sizeof(*lock), GFP_NOFS); if (!lock) return NULL; if (!lksb) { /* zero memory only if kernel-allocated */ - lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); + lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS); if (!lksb) { kfree(lock); return NULL; @@ -429,11 +455,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return DLM_REJECTED; - mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), - "Domain %s not fully joined!\n", dlm->name); - name = create->name; namelen = create->namelen; + status = DLM_REJECTED; + if (!dlm_domain_fully_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not fully joined, but node %u is " + "sending a create_lock message for lock %.*s!\n", + dlm->name, create->node_idx, namelen, name); + dlm_error(status); + goto leave; + } status = DLM_IVBUFLEN; if (namelen > DLM_LOCKID_NAME_MAX) { @@ -669,18 +700,22 @@ retry_lock: msleep(100); /* no waiting for dlm_reco_thread */ if (recovery) { - if (status == DLM_RECOVERING) { - mlog(0, "%s: got RECOVERING " - "for $REOCVERY lock, master " - "was %u\n", dlm->name, - res->owner); - dlm_wait_for_node_death(dlm, res->owner, - DLM_NODE_DEATH_WAIT_MAX); - } + if (status != DLM_RECOVERING) + goto retry_lock; + + mlog(0, "%s: got RECOVERING " + "for $RECOVERY lock, master " + "was %u\n", dlm->name, + res->owner); + /* wait to see the node go down, then + * drop down and allow the lockres to + * get cleaned up. need to remaster. 
*/ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); } else { dlm_wait_for_recovery(dlm); + goto retry_lock; } - goto retry_lock; } if (status != DLM_NORMAL) { diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 940be4c13b1..1b8346dd057 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -47,7 +47,6 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) @@ -74,6 +73,7 @@ struct dlm_master_list_entry wait_queue_head_t wq; atomic_t woken; struct kref mle_refs; + int inuse; unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, return 1; } -#if 0 -/* Code here is included but defined out as it aids debugging */ +#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) +static void _dlm_print_nodemap(unsigned long *map, const char *mapname) +{ + int i; + printk("%s=[ ", mapname); + for (i=0; i<O2NM_MAX_NODES; i++) + if (test_bit(i, map)) + printk("%d ", i); + printk("]"); +} -void dlm_print_one_mle(struct dlm_master_list_entry *mle) +static void dlm_print_one_mle(struct dlm_master_list_entry *mle) { - int i = 0, refs; + int refs; char *type; char attached; u8 master; unsigned int namelen; const char *name; struct kref *k; + unsigned long *maybe = mle->maybe_map, + *vote = mle->vote_map, + *resp = mle->response_map, + *node = mle->node_map; k = &mle->mle_refs; if (mle->type == DLM_MLE_BLOCK) @@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) name = mle->u.res->lockname.name; } - mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", - i, type, refs, master, mle->new_master, attached, - namelen, namelen, name); + mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", + namelen, name, type, refs, master, mle->new_master, attached, + mle->inuse); + dlm_print_nodemap(maybe); + printk(", "); + dlm_print_nodemap(vote); + printk(", "); + dlm_print_nodemap(resp); + printk(", "); + dlm_print_nodemap(node); + printk(", "); + printk("\n"); } +#if 0 +/* Code here is included but defined out as it aids debugging */ + static void dlm_dump_mles(struct dlm_ctxt *dlm) { struct dlm_master_list_entry *mle; struct list_head *iter; mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); - mlog(ML_NOTICE, " ####: type refs owner new events? 
lockname nodemap votemap respmap maybemap\n"); spin_lock(&dlm->master_lock); list_for_each(iter, &dlm->master_list) { mle = list_entry(iter, struct dlm_master_list_entry, list); @@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } +static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + mle->inuse++; + kref_get(&mle->mle_refs); +} + +static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + mle->inuse--; + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +} + /* remove from list and free */ static void __dlm_put_mle(struct dlm_master_list_entry *mle) { @@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle) assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); - BUG_ON(!atomic_read(&mle->mle_refs.refcount)); - - kref_put(&mle->mle_refs, dlm_mle_release); + if (!atomic_read(&mle->mle_refs.refcount)) { + /* this may or may not crash, but who cares. + * it's a BUG. */ + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + BUG(); + } else + kref_put(&mle->mle_refs, dlm_mle_release); } @@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, memset(mle->response_map, 0, sizeof(mle->response_map)); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; + mle->inuse = 0; if (mle->type == DLM_MLE_MASTER) { BUG_ON(!res); @@ -564,6 +618,28 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); + if (!hlist_unhashed(&res->hash_node) || + !list_empty(&res->granted) || + !list_empty(&res->converting) || + !list_empty(&res->blocked) || + !list_empty(&res->dirty) || + !list_empty(&res->recovering) || + !list_empty(&res->purge)) { + mlog(ML_ERROR, + "Going to BUG for resource %.*s." + " We're on a list! [%c%c%c%c%c%c%c]\n", + res->lockname.len, res->lockname.name, + !hlist_unhashed(&res->hash_node) ? 'H' : ' ', + !list_empty(&res->granted) ? 'G' : ' ', + !list_empty(&res->converting) ? 'C' : ' ', + !list_empty(&res->blocked) ? 'B' : ' ', + !list_empty(&res->dirty) ? 'D' : ' ', + !list_empty(&res->recovering) ? 'R' : ' ', + !list_empty(&res->purge) ? 'P' : ' '); + + dlm_print_one_lock_resource(res); + } + /* By the time we're ready to blow this guy away, we shouldn't * be on any lists. 
*/ BUG_ON(!hlist_unhashed(&res->hash_node)); @@ -579,11 +655,6 @@ static void dlm_lockres_release(struct kref *kref) kfree(res); } -void dlm_lockres_get(struct dlm_lock_resource *res) -{ - kref_get(&res->refs); -} - void dlm_lockres_put(struct dlm_lock_resource *res) { kref_put(&res->refs, dlm_lockres_release); @@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, memcpy(qname, name, namelen); res->lockname.len = namelen; - res->lockname.hash = full_name_hash(name, namelen); + res->lockname.hash = dlm_lockid_hash(name, namelen); init_waitqueue_head(&res->wq); spin_lock_init(&res->spinlock); @@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res; - res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); if (!res) return NULL; - res->lockname.name = kmalloc(namelen, GFP_KERNEL); + res->lockname.name = kmalloc(namelen, GFP_NOFS); if (!res->lockname.name) { kfree(res); return NULL; @@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, int blocked = 0; int ret, nodenum; struct dlm_node_iter iter; - unsigned int namelen; + unsigned int namelen, hash; int tries = 0; int bit, wait_on_recovery = 0; BUG_ON(!lockid); namelen = strlen(lockid); + hash = dlm_lockid_hash(lockid, namelen); mlog(0, "get lockres %s (len %d)\n", lockid, namelen); lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); + tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); if (tmpres) { spin_unlock(&dlm->spinlock); mlog(0, "found in hash!\n"); @@ -704,7 +776,7 @@ lookup: mlog(0, "allocating a new resource\n"); /* nothing found and we need to allocate one. */ alloc_mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!alloc_mle) goto leave; res = dlm_new_lockres(dlm, lockid, namelen); @@ -790,10 +862,11 @@ lookup: * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we * need an extra one to keep from a bad ptr deref. 
*/ - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); +redo_request: while (wait_on_recovery) { /* any cluster changes that occurred after dropping the * dlm spinlock would be detectable be a change on the mle, @@ -812,7 +885,7 @@ lookup: } dlm_kick_recovery_thread(dlm); - msleep(100); + msleep(1000); dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); @@ -825,13 +898,15 @@ lookup: } else wait_on_recovery = 0; spin_unlock(&dlm->spinlock); + + if (wait_on_recovery) + dlm_wait_for_node_recovery(dlm, bit, 10000); } /* must wait for lock to be mastered elsewhere */ if (blocked) goto wait; -redo_request: ret = -EINVAL; dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { @@ -856,6 +931,7 @@ wait: /* keep going until the response map includes all nodes */ ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); if (ret < 0) { + wait_on_recovery = 1; mlog(0, "%s:%.*s: node map changed, redo the " "master request now, blocked=%d\n", dlm->name, res->lockname.len, @@ -866,7 +942,7 @@ wait: dlm->name, res->lockname.len, res->lockname.name, blocked); dlm_print_one_lock_resource(res); - /* dlm_print_one_mle(mle); */ + dlm_print_one_mle(mle); tries = 0; } goto redo_request; @@ -880,7 +956,7 @@ wait: dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); /* put the extra ref */ - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); wake_waiters: spin_lock(&res->spinlock); @@ -921,12 +997,14 @@ recheck: spin_unlock(&res->spinlock); /* this will cause the master to re-assert across * the whole cluster, freeing up mles */ - ret = dlm_do_master_request(mle, res->owner); - if (ret < 0) { - /* give recovery a chance to run */ - mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); - msleep(500); - goto recheck; + if (res->owner != dlm->node_num) { + ret = dlm_do_master_request(mle, res->owner); + if (ret < 0) { + /* give recovery a chance to run */ + mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); + msleep(500); + goto recheck; + } } ret = 0; goto leave; @@ -962,6 +1040,12 @@ recheck: "rechecking now\n", dlm->name, res->lockname.len, res->lockname.name); goto recheck; + } else { + if (!voting_done) { + mlog(0, "map not changed and voting not done " + "for %s:%.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + } } if (m != O2NM_MAX_NODES) { @@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, set_bit(node, mle->vote_map); } else { mlog(ML_ERROR, "node down! %d\n", node); - - /* if the node wasn't involved in mastery skip it, - * but clear it out from the maps so that it will - * not affect mastery of this lockres */ - clear_bit(node, mle->response_map); - clear_bit(node, mle->vote_map); - if (!test_bit(node, mle->maybe_map)) - goto next; - - /* if we're already blocked on lock mastery, and the - * dead node wasn't the expected master, or there is - * another node in the maybe_map, keep waiting */ if (blocked) { int lowest = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); @@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* act like it was never there */ clear_bit(node, mle->maybe_map); - if (node != lowest) - goto next; - - mlog(ML_ERROR, "expected master %u died while " - "this node was blocked waiting on it!\n", - node); - lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, - lowest+1); - if (lowest < O2NM_MAX_NODES) { - mlog(0, "still blocked. 
waiting " - "on %u now\n", lowest); - goto next; + if (node == lowest) { + mlog(0, "expected master %u died" + " while this node was blocked " + "waiting on it!\n", node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "%s:%.*s:still " + "blocked. waiting on %u " + "now\n", dlm->name, + res->lockname.len, + res->lockname.name, + lowest); + } else { + /* mle is an MLE_BLOCK, but + * there is now nothing left to + * block on. we need to return + * all the way back out and try + * again with an MLE_MASTER. + * dlm_do_local_recovery_cleanup + * has already run, so the mle + * refcount is ok */ + mlog(0, "%s:%.*s: no " + "longer blocking. try to " + "master this here\n", + dlm->name, + res->lockname.len, + res->lockname.name); + mle->type = DLM_MLE_MASTER; + mle->u.res = res; + } } - - /* mle is an MLE_BLOCK, but there is now - * nothing left to block on. we need to return - * all the way back out and try again with - * an MLE_MASTER. dlm_do_local_recovery_cleanup - * has already run, so the mle refcount is ok */ - mlog(0, "no longer blocking. we can " - "try to master this here\n"); - mle->type = DLM_MLE_MASTER; - memset(mle->maybe_map, 0, - sizeof(mle->maybe_map)); - memset(mle->response_map, 0, - sizeof(mle->maybe_map)); - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); - mle->u.res = res; - set_bit(dlm->node_num, mle->maybe_map); - - ret = -EAGAIN; - goto next; } - clear_bit(node, mle->maybe_map); - if (node > dlm->node_num) - goto next; - - mlog(0, "dead node in map!\n"); - /* yuck. go back and re-contact all nodes - * in the vote_map, removing this node. */ - memset(mle->response_map, 0, - sizeof(mle->response_map)); + /* now blank out everything, as if we had never + * contacted anyone */ + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + memset(mle->response_map, 0, sizeof(mle->response_map)); + /* reset the vote_map to the current node_map */ + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + /* put myself into the maybe map */ + if (mle->type != DLM_MLE_BLOCK) + set_bit(dlm->node_num, mle->maybe_map); } ret = -EAGAIN; -next: node = dlm_bitmap_diff_iter_next(&bdi, &sc); } return ret; @@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; int found, ret; int set_maybe; int dispatch_assert = 0; @@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) name = request->name; namelen = request->namelen; + hash = dlm_lockid_hash(name, namelen); if (namelen > DLM_LOCKID_NAME_MAX) { response = DLM_IVBUFLEN; @@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) way_up_top: spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_unlock(&dlm->spinlock); @@ -1459,21 +1531,18 @@ way_up_top: spin_unlock(&dlm->spinlock); mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { response = DLM_MASTER_RESP_ERROR; mlog_errno(-ENOMEM); goto send_response; } - spin_lock(&dlm->spinlock); - dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, - name, namelen); - spin_unlock(&dlm->spinlock); goto way_up_top; } // mlog(0, 
"this is second time thru, already allocated, " // "add the block.\n"); + dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); list_add(&mle->list, &dlm->master_list); response = DLM_MASTER_RESP_NO; @@ -1556,6 +1625,8 @@ again: dlm_node_iter_init(nodemap, &iter); while ((to = dlm_node_iter_next(&iter)) >= 0) { int r = 0; + struct dlm_master_list_entry *mle = NULL; + mlog(0, "sending assert master to %d (%.*s)\n", to, namelen, lockname); memset(&assert, 0, sizeof(assert)); @@ -1567,20 +1638,28 @@ again: tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), to, &r); if (tmpret < 0) { - mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); + mlog(0, "assert_master returned %d!\n", tmpret); if (!dlm_is_host_down(tmpret)) { - mlog(ML_ERROR, "unhandled error!\n"); + mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); BUG(); } /* a node died. finish out the rest of the nodes. */ - mlog(ML_ERROR, "link to %d went down!\n", to); + mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; } else if (r < 0) { /* ok, something horribly messed. kill thyself. */ mlog(ML_ERROR,"during assert master of %.*s to %u, " "got %d.\n", namelen, lockname, to, r); - dlm_dump_lock_resources(dlm); + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + if (dlm_find_mle(dlm, &mle, (char *)lockname, + namelen)) { + dlm_print_one_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); BUG(); } else if (r == EAGAIN) { mlog(0, "%.*s: node %u create mles on other " @@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; struct dlm_lock_resource *res = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; u32 flags; int master_request = 0; int ret = 0; @@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) name = assert->name; namelen = assert->namelen; + hash = dlm_lockid_hash(name, namelen); flags = be32_to_cpu(assert->flags); if (namelen > DLM_LOCKID_NAME_MAX) { @@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ - mlog(ML_ERROR, "no bits set in the maybe_map, but %u " + mlog(0, "no bits set in the maybe_map, but %u " "is asserting! (%.*s)\n", assert->node_idx, namelen, name); } else if (bit != assert->node_idx) { @@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) * number winning the mastery will respond * YES to mastery requests, but this node * had no way of knowing. let it pass. */ - mlog(ML_ERROR, "%u is the lowest node, " + mlog(0, "%u is the lowest node, " "%u is asserting. 
(%.*s) %u must " "have begun after %u won.\n", bit, assert->node_idx, namelen, name, bit, assert->node_idx); } } + if (mle->type == DLM_MLE_MIGRATION) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "%s:%.*s: got cleanup assert" + " from %u for migration\n", + dlm->name, namelen, name, + assert->node_idx); + } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { + mlog(0, "%s:%.*s: got unrelated assert" + " from %u for migration, ignoring\n", + dlm->name, namelen, name, + assert->node_idx); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + goto done; + } + } } spin_unlock(&dlm->master_lock); /* ok everything checks out with the MLE * now check to see if there is a lockres */ - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) goto kill; } if (!mle) { - if (res->owner != assert->node_idx) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && + res->owner != assert->node_idx) { mlog(ML_ERROR, "assert_master from " "%u, but current owner is " "%u! (%.*s)\n", @@ -1732,6 +1830,7 @@ ok: if (mle) { int extra_ref = 0; int nn = -1; + int rr, err = 0; spin_lock(&mle->spinlock); if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) @@ -1751,27 +1850,64 @@ ok: wake_up(&mle->wq); spin_unlock(&mle->spinlock); - if (mle->type == DLM_MLE_MIGRATION && res) { - mlog(0, "finishing off migration of lockres %.*s, " - "from %u to %u\n", - res->lockname.len, res->lockname.name, - dlm->node_num, mle->new_master); + if (res) { spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_MIGRATING; - dlm_change_lockres_owner(dlm, res, mle->new_master); - BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + if (mle->type == DLM_MLE_MIGRATION) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + } else { + dlm_change_lockres_owner(dlm, res, mle->master); + } spin_unlock(&res->spinlock); } - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); - + + /* master is known, detach if not already detached. + * ensures that only one assert_master call will happen + * on this mle. */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + rr = atomic_read(&mle->mle_refs.refcount); + if (mle->inuse > 0) { + if (extra_ref && rr < 3) + err = 1; + else if (!extra_ref && rr < 2) + err = 1; + } else { + if (extra_ref && rr < 2) + err = 1; + else if (!extra_ref && rr < 1) + err = 1; + } + if (err) { + mlog(ML_ERROR, "%s:%.*s: got assert master from %u " + "that will mess up this node, refs=%d, extra=%d, " + "inuse=%d\n", dlm->name, namelen, name, + assert->node_idx, rr, extra_ref, mle->inuse); + dlm_print_one_mle(mle); + } + list_del_init(&mle->list); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); if (extra_ref) { /* the assert master message now balances the extra * ref given by the master / migration request message. * if this is the last put, it will be removed * from the list. 
*/ - dlm_put_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + } else if (res) { + if (res->owner != assert->node_idx) { + mlog(0, "assert_master from %u, but current " + "owner is %u (%.*s), no mle\n", assert->node_idx, + res->owner, namelen, name); } } @@ -1788,12 +1924,12 @@ done: kill: /* kill the caller! */ + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); dlm_lockres_put(res); - mlog(ML_ERROR, "Bad message received from another node. Dumping state " - "and killing the other node now! This node is OK and can continue.\n"); - dlm_dump_lock_resources(dlm); dlm_put(dlm); return -EINVAL; } @@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, int ignore_higher, u8 request_from, u32 flags) { struct dlm_work_item *item; - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) return -ENOMEM; @@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); return 0; } @@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) } } + /* + * If we're migrating this lock to someone else, we are no + * longer allowed to assert our own mastery. OTOH, we need to + * prevent migration from starting while we're still asserting + * our dominance. The reserved ast delays migration. + */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Someone asked us to assert mastery, but we're " + "in the middle of migration. Skipping assert, " + "the new master will handle that.\n"); + spin_unlock(&res->spinlock); + goto put; + } else + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + /* this call now finishes out the nodemap * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", @@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ - mlog_errno(ret); + if (!dlm_is_host_down(ret)) + mlog_errno(ret); } + /* Ok, we've asserted ourselves. Let's let migration start. */ + dlm_lockres_release_ast(dlm, res); + +put: dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, BUG(); /* host is down, so answer for that node would be * DLM_LOCK_RES_OWNER_UNKNOWN. continue. 
*/ + ret = 0; } if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { @@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, */ ret = -ENOMEM; - mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); if (!mres) { mlog_errno(ret); goto leave; } mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { mlog_errno(ret); goto leave; @@ -2117,7 +2276,7 @@ fail: * take both dlm->spinlock and dlm->master_lock */ spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2134,7 +2293,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } @@ -2164,8 +2326,8 @@ fail: /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { - mlog(0, "%s:%.*s: expected migration target %u " - "is no longer up. restarting.\n", + mlog(0, "%s:%.*s: expected migration " + "target %u is no longer up, restarting\n", dlm->name, res->lockname.len, res->lockname.name, target); ret = -ERESTARTSYS; @@ -2175,7 +2337,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } /* TODO: if node died: stop, clean up, return error */ @@ -2191,7 +2356,7 @@ fail: /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); ret = 0; dlm_lockres_calc_usage(dlm, res); @@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; const char *name; - unsigned int namelen; + unsigned int namelen, hash; int ret = 0; if (!dlm_grab(dlm)) @@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) name = migrate->name; namelen = migrate->namelen; + hash = dlm_lockid_hash(name, namelen); /* preallocate.. 
if this fails, abort */ mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { ret = -ENOMEM; @@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) /* check for pre-existing lock */ spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); spin_lock(&dlm->master_lock); if (res) { @@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); + __dlm_mle_detach_hb_events(dlm, mle); } spin_unlock(&tmp->spinlock); } @@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) struct list_head *iter, *iter2; struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; + unsigned int hash; mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); top: @@ -2640,7 +2808,7 @@ top: * may result in the mle being unlinked and * freed, but there may still be a process * waiting in the dlmlock path which is fine. */ - mlog(ML_ERROR, "node %u was expected master\n", + mlog(0, "node %u was expected master\n", dead_node); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); @@ -2673,19 +2841,21 @@ top: /* remove from the list early. NOTE: unlinking * list_head while in list_for_each_safe */ + __dlm_mle_detach_hb_events(dlm, mle); spin_lock(&mle->spinlock); list_del_init(&mle->list); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - mlog(0, "node %u died during migration from " - "%u to %u!\n", dead_node, + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, mle->new_master); /* if there is a lockres associated with this * mle, find it and set its owner to UNKNOWN */ + hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len); res = __dlm_lookup_lockres(dlm, mle->u.name.name, - mle->u.name.len); + mle->u.name.len, hash); if (res) { /* unfortunately if we hit this rare case, our * lock ordering is messed. 
we need to drop diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 805cbabac05..da399013516 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -115,12 +115,37 @@ static u64 dlm_get_next_mig_cookie(void) return c; } +static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, + u8 dead_node) +{ + assert_spin_locked(&dlm->spinlock); + if (dlm->reco.dead_node != dead_node) + mlog(0, "%s: changing dead_node from %u to %u\n", + dlm->name, dlm->reco.dead_node, dead_node); + dlm->reco.dead_node = dead_node; +} + +static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, + u8 master) +{ + assert_spin_locked(&dlm->spinlock); + mlog(0, "%s: changing new_master from %u to %u\n", + dlm->name, dlm->reco.new_master, master); + dlm->reco.new_master = master; +} + +static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + assert_spin_locked(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); +} + static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) { spin_lock(&dlm->spinlock); - clear_bit(dlm->reco.dead_node, dlm->recovery_map); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); } @@ -132,12 +157,21 @@ void dlm_dispatch_work(void *data) struct list_head *iter, *iter2; struct dlm_work_item *item; dlm_workfunc_t *workfunc; + int tot=0; + + if (!dlm_joined(dlm)) + return; spin_lock(&dlm->work_lock); list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); list_for_each_safe(iter, iter2, &tmp_list) { + tot++; + } + mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); + + list_for_each_safe(iter, iter2, &tmp_list) { item = list_entry(iter, struct dlm_work_item, list); workfunc = item->func; list_del_init(&item->list); @@ -220,6 +254,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) * */ +static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) +{ + struct dlm_reco_node_data *ndata; + struct dlm_lock_resource *res; + + mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state & DLM_RECO_STATE_ACTIVE ? 
"ACTIVE" : "inactive", + dlm->reco.dead_node, dlm->reco.new_master); + + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + char *st = "unknown"; + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + st = "init"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + st = "requesting"; + break; + case DLM_RECO_NODE_DATA_DEAD: + st = "dead"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + st = "receiving"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + st = "requested"; + break; + case DLM_RECO_NODE_DATA_DONE: + st = "done"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + st = "finalize-sent"; + break; + default: + st = "bad"; + break; + } + mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", + dlm->name, ndata->node_num, st); + } + list_for_each_entry(res, &dlm->reco.resources, recovering) { + mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", + dlm->name, res->lockname.len, res->lockname.name); + } +} #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) @@ -267,11 +347,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) { int dead; spin_lock(&dlm->spinlock); - dead = test_bit(node, dlm->domain_map); + dead = !test_bit(node, dlm->domain_map); spin_unlock(&dlm->spinlock); return dead; } +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) +{ + int recovered; + spin_lock(&dlm->spinlock); + recovered = !test_bit(node, dlm->recovery_map); + spin_unlock(&dlm->spinlock); + return recovered; +} + + int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) { if (timeout) { @@ -290,6 +382,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) return 0; } +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(0, "%s: waiting %dms for notification of " + "recovery of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(0, "%s: waiting indefinitely for notification " + "of recovery of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. 
* the dlm recovery thread will set this state when it begins @@ -308,6 +418,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm) void dlm_wait_for_recovery(struct dlm_ctxt *dlm) { + if (dlm_in_recovery(dlm)) { + mlog(0, "%s: reco thread %d in recovery: " + "state=%d, master=%u, dead=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state, dlm->reco.new_master, + dlm->reco.dead_node); + } wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); } @@ -341,7 +458,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) mlog(0, "new master %u died while recovering %u!\n", dlm->reco.new_master, dlm->reco.dead_node); /* unset the new_master, leave dead_node */ - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } /* select a target to recover */ @@ -350,14 +467,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); if (bit >= O2NM_MAX_NODES || bit < 0) - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else - dlm->reco.dead_node = bit; + dlm_set_reco_dead_node(dlm, bit); } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { /* BUG? */ mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", dlm->reco.dead_node); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { @@ -366,7 +483,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) /* return to main thread loop and sleep. */ return 0; } - mlog(0, "recovery thread found node %u in the recovery map!\n", + mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.dead_node); spin_unlock(&dlm->spinlock); @@ -389,8 +507,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } mlog(0, "another node will master this recovery session.\n"); } - mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", - dlm->name, dlm->reco.new_master, + mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, dlm->node_num, dlm->reco.dead_node); /* it is safe to start everything back up here @@ -402,11 +520,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) return 0; master_here: - mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", + mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n", + dlm->dlm_reco_thread_task->pid, dlm->name, dlm->reco.dead_node, dlm->node_num); status = dlm_remaster_locks(dlm, dlm->reco.dead_node); if (status < 0) { + /* we should never hit this anymore */ mlog(ML_ERROR, "error %d remastering locks for node %u, " "retrying.\n", status, dlm->reco.dead_node); /* yield a bit to allow any final network messages @@ -433,9 +553,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) int destroy = 0; int pass = 0; - status = dlm_init_recovery_area(dlm, dead_node); - if (status < 0) - goto leave; + do { + /* we have become recovery master. there is no escaping + * this, so just keep trying until we get it. 
*/ + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) { + mlog(ML_ERROR, "%s: failed to alloc recovery area, " + "retrying\n", dlm->name); + msleep(1000); + } + } while (status != 0); /* safe to access the node data list without a lock, since this * process is the only one to change the list */ @@ -452,16 +579,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) continue; } - status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); - if (status < 0) { - mlog_errno(status); - if (dlm_is_host_down(status)) - ndata->state = DLM_RECO_NODE_DATA_DEAD; - else { - destroy = 1; - goto leave; + do { + status = dlm_request_all_locks(dlm, ndata->node_num, + dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) { + /* node died, ignore it for recovery */ + status = 0; + ndata->state = DLM_RECO_NODE_DATA_DEAD; + /* wait for the domain map to catch up + * with the network state. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); + } else { + /* -ENOMEM on the other node */ + mlog(0, "%s: node %u returned " + "%d during recovery, retrying " + "after a short wait\n", + dlm->name, ndata->node_num, + status); + msleep(100); + } } - } + } while (status != 0); switch (ndata->state) { case DLM_RECO_NODE_DATA_INIT: @@ -473,10 +620,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) mlog(0, "node %u died after requesting " "recovery info for node %u\n", ndata->node_num, dead_node); - // start all over - destroy = 1; - status = -EAGAIN; - goto leave; + /* fine. don't need this node's info. + * continue without it. */ + break; case DLM_RECO_NODE_DATA_REQUESTING: ndata->state = DLM_RECO_NODE_DATA_REQUESTED; mlog(0, "now receiving recovery data from " @@ -520,35 +666,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) BUG(); break; case DLM_RECO_NODE_DATA_DEAD: - mlog(ML_NOTICE, "node %u died after " + mlog(0, "node %u died after " "requesting recovery info for " "node %u\n", ndata->node_num, dead_node); - spin_unlock(&dlm_reco_state_lock); - // start all over - destroy = 1; - status = -EAGAIN; - /* instead of spinning like crazy here, - * wait for the domain map to catch up - * with the network state. otherwise this - * can be hit hundreds of times before - * the node is really seen as dead. */ - wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_dead(dlm, - ndata->node_num), - msecs_to_jiffies(1000)); - mlog(0, "waited 1 sec for %u, " - "dead? %s\n", ndata->node_num, - dlm_is_node_dead(dlm, ndata->node_num) ? - "yes" : "no"); - goto leave; + break; case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: + mlog(0, "%s: node %u still in state %s\n", + dlm->name, ndata->node_num, + ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? 
+ "receiving" : "requested"); all_nodes_done = 0; break; case DLM_RECO_NODE_DATA_DONE: + mlog(0, "%s: node %u state is done\n", + dlm->name, ndata->node_num); break; case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(0, "%s: node %u state is finalize\n", + dlm->name, ndata->node_num); break; } } @@ -578,7 +715,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) jiffies, dlm->reco.dead_node, dlm->node_num, dlm->reco.new_master); destroy = 1; - status = ret; + status = 0; /* rescan everything marked dirty along the way */ dlm_kick_thread(dlm, NULL); break; @@ -591,7 +728,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } -leave: if (destroy) dlm_destroy_recovery_area(dlm, dead_node); @@ -617,7 +753,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) } BUG_ON(num == dead_node); - ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); + ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS); if (!ndata) { dlm_destroy_recovery_area(dlm, dead_node); return -ENOMEM; @@ -691,16 +827,25 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return -EINVAL; + if (lr->dead_node != dlm->reco.dead_node) { + mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " + "dead_node is %u\n", dlm->name, lr->node_idx, + lr->dead_node, dlm->reco.dead_node); + dlm_print_reco_node_status(dlm); + /* this is a hack */ + dlm_put(dlm); + return -ENOMEM; + } BUG_ON(lr->dead_node != dlm->reco.dead_node); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) { dlm_put(dlm); return -ENOMEM; } /* this will get freed by dlm_request_all_locks_worker */ - buf = (char *) __get_free_page(GFP_KERNEL); + buf = (char *) __get_free_page(GFP_NOFS); if (!buf) { kfree(item); dlm_put(dlm); @@ -715,7 +860,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); dlm_put(dlm); return 0; @@ -730,32 +875,34 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) struct list_head *iter; int ret; u8 dead_node, reco_master; + int skip_all_done = 0; dlm = item->dlm; dead_node = item->u.ral.dead_node; reco_master = item->u.ral.reco_master; mres = (struct dlm_migratable_lockres *)data; + mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", + dlm->name, dead_node, reco_master); + if (dead_node != dlm->reco.dead_node || reco_master != dlm->reco.new_master) { - /* show extra debug info if the recovery state is messed */ - mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " - "request(dead=%u, master=%u)\n", - dlm->name, dlm->reco.dead_node, dlm->reco.new_master, - dead_node, reco_master); - mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " - "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", - dlm->name, mres->lockname_len, mres->lockname, mres->master, - mres->num_locks, mres->total_locks, mres->flags, - dlm_get_lock_cookie_node(mres->ml[0].cookie), - dlm_get_lock_cookie_seq(mres->ml[0].cookie), - mres->ml[0].list, mres->ml[0].flags, - mres->ml[0].type, mres->ml[0].convert_type, - mres->ml[0].highest_blocked, mres->ml[0].node); - BUG(); + /* worker could have been created before the recovery master + * died. if so, do not continue, but do not error. 
*/ + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: will not send recovery state, " + "recovery master %u died, thread=(dead=%u,mas=%u)" + " current=(dead=%u,mas=%u)\n", dlm->name, + reco_master, dead_node, reco_master, + dlm->reco.dead_node, dlm->reco.new_master); + } else { + mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " + "master=%u), request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, + dlm->reco.new_master, dead_node, reco_master); + } + goto leave; } - BUG_ON(dead_node != dlm->reco.dead_node); - BUG_ON(reco_master != dlm->reco.new_master); /* lock resources should have already been moved to the * dlm->reco.resources list. now move items from that list @@ -766,12 +913,20 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) dlm_move_reco_locks_to_list(dlm, &resources, dead_node); /* now we can begin blasting lockreses without the dlm lock */ + + /* any errors returned will be due to the new_master dying, + * the dlm_reco_thread should detect this */ list_for_each(iter, &resources) { res = list_entry (iter, struct dlm_lock_resource, recovering); ret = dlm_send_one_lockres(dlm, res, mres, reco_master, DLM_MRES_RECOVERY); - if (ret < 0) - mlog_errno(ret); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery state for dead node %u, ret=%d\n", dlm->name, + reco_master, dead_node, ret); + skip_all_done = 1; + break; + } } /* move the resources back to the list */ @@ -779,10 +934,15 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) list_splice_init(&resources, &dlm->reco.resources); spin_unlock(&dlm->spinlock); - ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); - if (ret < 0) - mlog_errno(ret); - + if (!skip_all_done) { + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery all-done for dead node %u, ret=%d\n", + dlm->name, reco_master, dead_node, ret); + } + } +leave: free_page((unsigned long)data); } @@ -801,8 +961,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); - /* negative status is ignored by the caller */ - if (ret >= 0) + if (ret < 0) { + if (!dlm_is_host_down(ret)) { + mlog_errno(ret); + mlog(ML_ERROR, "%s: unknown error sending data-done " + "to %u\n", dlm->name, send_to); + BUG(); + } + } else ret = tmpret; return ret; } @@ -822,7 +988,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " "node_idx=%u, this node=%u\n", done->dead_node, dlm->reco.dead_node, done->node_idx, dlm->node_num); - BUG_ON(done->dead_node != dlm->reco.dead_node); + + mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), + "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); spin_lock(&dlm_reco_state_lock); list_for_each(iter, &dlm->reco.node_data) { @@ -905,13 +1075,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, mlog(0, "found lockres owned by dead node while " "doing recovery for node %u. 
sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { mlog(0, "found UNKNOWN owner while doing recovery " "for node %u. sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } } spin_unlock(&dlm->spinlock); @@ -1023,8 +1191,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock, ml->type == LKM_PRMODE) { /* if it is already set, this had better be a PR * and it has to match */ - if (mres->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { + if (!dlm_lvb_is_empty(mres->lvb) && + (ml->type == LKM_EXMODE || + memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { mlog(ML_ERROR, "mismatched lvbs!\n"); __dlm_print_one_lock_resource(lock->lockres); BUG(); @@ -1083,22 +1252,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, * we must send it immediately. */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg " - "returned %d, TODO\n", ret); - BUG(); - } + if (ret < 0) + goto error; } } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " - "TODO\n", ret); + if (ret < 0) + goto error; + return ret; + +error: + mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", + dlm->name, ret); + if (!dlm_is_host_down(ret)) BUG(); - } + mlog(0, "%s: node %u went down while sending %s " + "lockres %.*s\n", dlm->name, send_to, + flags & DLM_MRES_RECOVERY ? "recovery" : "migration", + res->lockname.len, res->lockname.name); return ret; } @@ -1146,8 +1318,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "all done flag. 
all lockres data received!\n"); ret = -ENOMEM; - buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!buf || !item) goto leave; @@ -1238,7 +1410,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); leave: dlm_put(dlm); @@ -1406,6 +1578,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_ctxt *dlm = data; struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; struct dlm_lock_resource *res = NULL; + unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; @@ -1415,8 +1588,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) return master; } + hash = dlm_lockid_hash(req->name, req->namelen); + spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, req->name, req->namelen); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); if (res) { spin_lock(&res->spinlock); master = res->owner; @@ -1483,7 +1658,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; - int i; + int i, bad; struct list_head *iter; struct dlm_lock *lock = NULL; @@ -1529,8 +1704,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* move the lock to its proper place */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, queue); + list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); mlog(0, "just reordered a local lock!\n"); @@ -1553,28 +1727,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, } lksb->flags |= (ml->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); - - if (mres->lvb[0]) { + + if (ml->type == LKM_NLMODE) + goto skip_lvb; + + if (!dlm_lvb_is_empty(mres->lvb)) { if (lksb->flags & DLM_LKSB_PUT_LVB) { /* other node was trying to update * lvb when node died. recreate the * lksb with the updated lvb. */ memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + /* the lock resource lvb update must happen + * NOW, before the spinlock is dropped. + * we no longer wait for the AST to update + * the lvb. */ + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } else { /* otherwise, the node is sending its * most recent valid lvb info */ BUG_ON(ml->type != LKM_EXMODE && ml->type != LKM_PRMODE); - if (res->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { - mlog(ML_ERROR, "received bad lvb!\n"); - __dlm_print_one_lock_resource(res); - BUG(); + if (!dlm_lvb_is_empty(res->lvb) && + (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + int i; + mlog(ML_ERROR, "%s:%.*s: received bad " + "lvb! 
type=%d\n", dlm->name, + res->lockname.len, + res->lockname.name, ml->type); + printk("lockres lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", res->lvb[i]); + printk("]\nmigrated lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", mres->lvb[i]); + printk("]\n"); + dlm_print_one_lock_resource(res); + BUG(); } memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } } - +skip_lvb: /* NOTE: * wrt lock queue ordering and recovery: @@ -1592,9 +1786,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, * relative to each other, but clearly *not* * preserved relative to locks from other nodes. */ + bad = 0; spin_lock(&res->spinlock); - dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == ml->cookie) { + u64 c = lock->ml.cookie; + mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " + "exists on this lockres!\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(c), + dlm_get_lock_cookie_seq(c)); + + mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " + "node=%u, cookie=%u:%llu, queue=%d\n", + ml->type, ml->convert_type, ml->node, + dlm_get_lock_cookie_node(ml->cookie), + dlm_get_lock_cookie_seq(ml->cookie), + ml->list); + + __dlm_print_one_lock_resource(res); + bad = 1; + break; + } + } + if (!bad) { + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + } spin_unlock(&res->spinlock); } mlog(0, "done running all the locks\n"); @@ -1618,8 +1836,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, struct dlm_lock *lock; res->state |= DLM_LOCK_RES_RECOVERING; - if (!list_empty(&res->recovering)) + if (!list_empty(&res->recovering)) { + mlog(0, + "Recovering res %s:%.*s, is already on recovery list!\n", + dlm->name, res->lockname.len, res->lockname.name); list_del_init(&res->recovering); + } + /* We need to hold a reference while on the recovery list */ + dlm_lockres_get(res); list_add_tail(&res->recovering, &dlm->reco.resources); /* find any pending locks and put them back on proper list */ @@ -1708,9 +1932,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); + dlm_lockres_put(res); } } @@ -1719,7 +1945,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, * the RECOVERING state and set the owner * if necessary */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, hash_iter, bucket, hash_node) { if (res->state & DLM_LOCK_RES_RECOVERING) { if (res->owner == dead_node) { @@ -1743,11 +1969,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, dlm->name, res->lockname.len, res->lockname.name, res->owner); list_del_init(&res->recovering); + dlm_lockres_put(res); } spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); } @@ -1884,7 +2112,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) * need to be fired as a result. 
*/ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ @@ -1924,6 +2152,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) { assert_spin_locked(&dlm->spinlock); + if (dlm->reco.new_master == idx) { + mlog(0, "%s: recovery master %d just died\n", + dlm->name, idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + /* finalize1 was reached, so it is safe to clear + * the new_master and dead_node. that recovery + * is complete. */ + mlog(0, "%s: dead master %d had reached " + "finalize1 state, clearing\n", dlm->name, idx); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + } + } + /* check to see if the node is already considered dead */ if (!test_bit(idx, dlm->live_nodes_map)) { mlog(0, "for domain %s, node %d is already dead. " @@ -2087,7 +2329,7 @@ again: /* set the new_master to this node */ spin_lock(&dlm->spinlock); - dlm->reco.new_master = dlm->node_num; + dlm_set_reco_master(dlm, dlm->node_num); spin_unlock(&dlm->spinlock); } @@ -2125,6 +2367,10 @@ again: mlog(0, "%s: reco master %u is ready to recover %u\n", dlm->name, dlm->reco.new_master, dlm->reco.dead_node); status = -EEXIST; + } else if (ret == DLM_RECOVERING) { + mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", + dlm->name, dlm->node_num); + goto again; } else { struct dlm_lock_resource *res; @@ -2156,7 +2402,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) mlog_entry("%u\n", dead_node); - mlog(0, "dead node is %u\n", dead_node); + mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); @@ -2214,6 +2460,14 @@ retry: * another ENOMEM */ msleep(100); goto retry; + } else if (ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + /* TODO Look into replacing msleep with cond_resched() */ + msleep(100); + goto retry; } } @@ -2229,8 +2483,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u wants to recover node %u\n", - br->node_idx, br->dead_node); + spin_lock(&dlm->spinlock); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(0, "%s: node %u wants to recover node %u (%u:%u) " + "but this node is in finalize state, waiting on finalize2\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + spin_unlock(&dlm->spinlock); + return EAGAIN; + } + spin_unlock(&dlm->spinlock); + + mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); @@ -2252,8 +2518,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) "node %u changing it to %u\n", dlm->name, dlm->reco.dead_node, br->node_idx, br->dead_node); } - dlm->reco.new_master = br->node_idx; - dlm->reco.dead_node = br->dead_node; + dlm_set_reco_master(dlm, br->node_idx); + dlm_set_reco_dead_node(dlm, br->dead_node); if (!test_bit(br->dead_node, dlm->recovery_map)) { mlog(0, "recovery master %u sees %u as dead, but this " "node has not yet. 
marking %u as dead\n", @@ -2272,10 +2538,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); + + mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + dlm_put(dlm); return 0; } +#define DLM_FINALIZE_STAGE2 0x01 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) { int ret = 0; @@ -2283,25 +2555,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) struct dlm_node_iter iter; int nodenum; int status; + int stage = 1; - mlog(0, "finishing recovery for node %s:%u\n", - dlm->name, dlm->reco.dead_node); + mlog(0, "finishing recovery for node %s:%u, " + "stage %d\n", dlm->name, dlm->reco.dead_node, stage); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); spin_unlock(&dlm->spinlock); +stage2: memset(&fr, 0, sizeof(fr)); fr.node_idx = dlm->node_num; fr.dead_node = dlm->reco.dead_node; + if (stage == 2) + fr.flags |= DLM_FINALIZE_STAGE2; while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { if (nodenum == dlm->node_num) continue; ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, &fr, sizeof(fr), nodenum, &status); - if (ret >= 0) { + if (ret >= 0) ret = status; + if (ret < 0) { + mlog_errno(ret); if (dlm_is_host_down(ret)) { /* this has no effect on this recovery * session, so set the status to zero to @@ -2309,13 +2587,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) mlog(ML_ERROR, "node %u went down after this " "node finished recovery.\n", nodenum); ret = 0; + continue; } - } - if (ret < 0) { - mlog_errno(ret); break; } } + if (stage == 1) { + /* reset the node_iter back to the top and send finalize2 */ + iter.curnode = -1; + stage = 2; + goto stage2; + } return ret; } @@ -2324,14 +2606,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) { struct dlm_ctxt *dlm = data; struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + int stage = 1; /* ok to return 0, domain has gone away */ if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u finalizing recovery of node %u\n", - fr->node_idx, fr->dead_node); + if (fr->flags & DLM_FINALIZE_STAGE2) + stage = 2; + mlog(0, "%s: node %u finalizing recovery stage%d of " + "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, + fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); + spin_lock(&dlm->spinlock); if (dlm->reco.new_master != fr->node_idx) { @@ -2347,13 +2634,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) BUG(); } - dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); - - spin_unlock(&dlm->spinlock); + switch (stage) { + case 1: + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(ML_ERROR, "%s: received finalize1 from " + "new master %u for dead node %u, but " + "this node has already received it!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + break; + case 2: + if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { + mlog(ML_ERROR, "%s: received finalize2 from " + "new master %u for dead node %u, but " + "this node did not have finalize1!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + 
dlm_reset_recovery(dlm); + dlm_kick_recovery_thread(dlm); + break; + default: + BUG(); + } - dlm_reset_recovery(dlm); + mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", + dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); - dlm_kick_recovery_thread(dlm); dlm_put(dlm); return 0; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 5be9d14f12c..0c822f3ffb0 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -39,6 +39,7 @@ #include <linux/inet.h> #include <linux/timer.h> #include <linux/kthread.h> +#include <linux/delay.h> #include "cluster/heartbeat.h" @@ -53,6 +54,8 @@ #include "cluster/masklog.h" static int dlm_thread(void *data); +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres); static void dlm_flush_asts(struct dlm_ctxt *dlm); @@ -80,7 +83,7 @@ repeat: } -static int __dlm_lockres_unused(struct dlm_lock_resource *res) +int __dlm_lockres_unused(struct dlm_lock_resource *res) { if (list_empty(&res->granted) && list_empty(&res->converting) && @@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ + /* For now, just keep any resource we master */ + if (res->owner == dlm->node_num) + { + if (!list_empty(&res->purge)) { + mlog(0, "we master %s:%.*s, but it is on " + "the purge list. Removing\n", + dlm->name, res->lockname.len, + res->lockname.name); + list_del_init(&res->purge); + dlm->purge_count--; + } + return; + } + if (list_empty(&res->purge)) { mlog(0, "putting lockres %.*s from purge list\n", res->lockname.len, res->lockname.name); @@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, res->last_used = jiffies; list_add_tail(&res->purge, &dlm->purge_list); dlm->purge_count++; + + /* if this node is not the owner, there is + * no way to keep track of who the owner could be. + * unhash it to avoid serious problems. */ + if (res->owner != dlm->node_num) { + mlog(0, "%s:%.*s: doing immediate " + "purge of lockres owned by %u\n", + dlm->name, res->lockname.len, + res->lockname.name, res->owner); + + dlm_purge_lockres_now(dlm, res); + } } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s from purge list\n", - res->lockname.len, res->lockname.name); + mlog(0, "removing lockres %.*s from purge list, " + "owner=%u\n", res->lockname.len, res->lockname.name, + res->owner); list_del_init(&res->purge); dlm->purge_count--; @@ -165,6 +195,7 @@ again: } else if (ret < 0) { mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", lockres->lockname.len, lockres->lockname.name); + msleep(100); goto again; } @@ -178,6 +209,24 @@ finish: __dlm_unhash_lockres(lockres); } +/* make an unused lockres go away immediately. + * as soon as the dlm spinlock is dropped, this lockres + * will not be found. kfree still happens on last put. 
*/ +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&lockres->spinlock); + + BUG_ON(!__dlm_lockres_unused(lockres)); + + if (!list_empty(&lockres->purge)) { + list_del_init(&lockres->purge); + dlm->purge_count--; + } + __dlm_unhash_lockres(lockres); +} + static void dlm_run_purge_list(struct dlm_ctxt *dlm, int purge_now) { @@ -318,8 +367,7 @@ converting: target->ml.type = target->ml.convert_type; target->ml.convert_type = LKM_IVMODE; - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; @@ -380,8 +428,7 @@ blocked: target->ml.type, target->ml.node); // target->ml.type is already correct - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; @@ -422,6 +469,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) /* don't shuffle secondary queues */ if ((res->owner == dlm->node_num) && !(res->state & DLM_LOCK_RES_DIRTY)) { + /* ref for dirty_list */ + dlm_lockres_get(res); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; } @@ -606,6 +655,8 @@ static int dlm_thread(void *data) list_del_init(&res->dirty); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); + /* Drop dirty_list ref */ + dlm_lockres_put(res); /* lockres can be re-dirtied/re-added to the * dirty_list in this gap, but that is ok */ @@ -642,8 +693,9 @@ static int dlm_thread(void *data) * spinlock and do NOT have the dlm lock. * safe to reserve/queue asts and run the lists. */ - mlog(0, "calling dlm_shuffle_lists with dlm=%p, " - "res=%p\n", dlm, res); + mlog(0, "calling dlm_shuffle_lists with dlm=%s, " + "res=%.*s\n", dlm->name, + res->lockname.len, res->lockname.name); /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); @@ -657,6 +709,8 @@ in_progress: /* if the lock was in-progress, stick * it on the back of the list */ if (delay) { + /* ref for dirty_list */ + dlm_lockres_get(res); spin_lock(&res->spinlock); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; @@ -677,7 +731,7 @@ in_progress: /* yield and continue right away if there is more work to do */ if (!n) { - yield(); + cond_resched(); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 7b1a2754267..b0c3134f4f7 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res, void dlm_commit_pending_cancel(struct dlm_lock_resource *res, struct dlm_lock *lock) { - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; } @@ -319,6 +318,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + if (owner == dlm->node_num) { + /* ended up trying to contact ourself. this means + * that the lockres had been remote but became local + * via a migration. 
just retry it, now as local */ + mlog(0, "%s:%.*s: this node became the master due to a " + "migration, re-evaluate now\n", dlm->name, + res->lockname.len, res->lockname.name); + return DLM_FORWARD; + } + memset(&unlock, 0, sizeof(unlock)); unlock.node_idx = dlm->node_num; unlock.flags = cpu_to_be32(flags); diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index 74ca4e5f976..e641b084b34 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name) u32 dlm_key; char *domain; - domain = kmalloc(name->len + 1, GFP_KERNEL); + domain = kmalloc(name->len + 1, GFP_NOFS); if (!domain) { mlog_errno(-ENOMEM); return ERR_PTR(-ENOMEM); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index eebc3cfa6be..3fe8781c22c 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); OCFS2_I(inode)->ip_handle = handle; - list_del(&(OCFS2_I(inode)->ip_handle_list)); - list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); + list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); } static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index efc7c91128a..93a56bd4a2b 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -1,5 +1,4 @@ -/* $Id: inode.c,v 1.15 2001/11/12 09:43:39 davem Exp $ - * openpromfs.c: /proc/openprom handling routines +/* inode.c: /proc/openprom handling routines * * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com) * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) @@ -12,762 +11,245 @@ #include <linux/openprom_fs.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/smp_lock.h> +#include <linux/seq_file.h> #include <asm/openprom.h> #include <asm/oplib.h> +#include <asm/prom.h> #include <asm/uaccess.h> -#define ALIASES_NNODES 64 - -typedef struct { - u16 parent; - u16 next; - u16 child; - u16 first_prop; - u32 node; -} openpromfs_node; - -typedef struct { -#define OPP_STRING 0x10 -#define OPP_STRINGLIST 0x20 -#define OPP_BINARY 0x40 -#define OPP_HEXSTRING 0x80 -#define OPP_DIRTY 0x01 -#define OPP_QUOTED 0x02 -#define OPP_NOTQUOTED 0x04 -#define OPP_ASCIIZ 0x08 - u32 flag; - u32 alloclen; - u32 len; - char *value; - char name[8]; -} openprom_property; - -static openpromfs_node *nodes; -static int alloced; -static u16 last_node; -static u16 first_prop; -static u16 options = 0xffff; -static u16 aliases = 0xffff; -static int aliases_nodes; -static char *alias_names [ALIASES_NNODES]; - -#define OPENPROM_ROOT_INO 16 -#define OPENPROM_FIRST_INO OPENPROM_ROOT_INO -#define NODE(ino) nodes[ino - OPENPROM_FIRST_INO] -#define NODE2INO(node) (node + OPENPROM_FIRST_INO) -#define NODEP2INO(no) (no + OPENPROM_FIRST_INO + last_node) - -static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *); -static int openpromfs_readdir(struct file *, void *, filldir_t); -static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd); -static int openpromfs_unlink (struct inode *, struct dentry *dentry); +static DEFINE_MUTEX(op_mutex); -static inline u16 ptr_nod(void *p) -{ - return (long)p & 0xFFFF; -} +#define OPENPROM_ROOT_INO 0 -static ssize_t nodenum_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +enum op_inode_type { + op_inode_node, + 
op_inode_prop, +}; + +union op_inode_data { + struct device_node *node; + struct property *prop; +}; + +struct op_inode_info { + struct inode vfs_inode; + enum op_inode_type type; + union op_inode_data u; +}; + +static inline struct op_inode_info *OP_I(struct inode *inode) { - struct inode *inode = file->f_dentry->d_inode; - char buffer[10]; - - if (count < 0 || !inode->u.generic_ip) - return -EINVAL; - sprintf (buffer, "%8.8lx\n", (long)inode->u.generic_ip); - if (file->f_pos >= 9) - return 0; - if (count > 9 - file->f_pos) - count = 9 - file->f_pos; - if (copy_to_user(buf, buffer + file->f_pos, count)) - return -EFAULT; - *ppos += count; - return count; + return container_of(inode, struct op_inode_info, vfs_inode); } -static ssize_t property_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) +static int is_string(unsigned char *p, int len) { - struct inode *inode = filp->f_dentry->d_inode; - int i, j, k; - u32 node; - char *p, *s; - u32 *q; - openprom_property *op; - char buffer[64]; - - if (!filp->private_data) { - node = nodes[ptr_nod(inode->u.generic_ip)].node; - i = ((u32)(long)inode->u.generic_ip) >> 16; - if (ptr_nod(inode->u.generic_ip) == aliases) { - if (i >= aliases_nodes) - p = NULL; - else - p = alias_names [i]; - } else - for (p = prom_firstprop (node, buffer); - i && p && *p; - p = prom_nextprop (node, p, buffer), i--) - /* nothing */ ; - if (!p || !*p) - return -EIO; - i = prom_getproplen (node, p); - if (i < 0) { - if (ptr_nod(inode->u.generic_ip) == aliases) - i = 0; - else - return -EIO; - } - k = i; - if (i < 64) i = 64; - filp->private_data = kmalloc (sizeof (openprom_property) - + (j = strlen (p)) + 2 * i, - GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - op = filp->private_data; - op->flag = 0; - op->alloclen = 2 * i; - strcpy (op->name, p); - op->value = (char *)(((unsigned long)(op->name + j + 4)) & ~3); - op->len = k; - if (k && prom_getproperty (node, p, op->value, i) < 0) - return -EIO; - op->value [k] = 0; - if (k) { - for (s = NULL, p = op->value; p < op->value + k; p++) { - if ((*p >= ' ' && *p <= '~') || *p == '\n') { - op->flag |= OPP_STRING; - s = p; - continue; - } - if (p > op->value && !*p && s == p - 1) { - if (p < op->value + k - 1) - op->flag |= OPP_STRINGLIST; - else - op->flag |= OPP_ASCIIZ; - continue; - } - if (k == 1 && !*p) { - op->flag |= (OPP_STRING|OPP_ASCIIZ); - break; - } - op->flag &= ~(OPP_STRING|OPP_STRINGLIST); - if (k & 3) - op->flag |= OPP_HEXSTRING; - else - op->flag |= OPP_BINARY; - break; - } - if (op->flag & OPP_STRINGLIST) - op->flag &= ~(OPP_STRING); - if (op->flag & OPP_ASCIIZ) - op->len--; - } - } else - op = filp->private_data; - if (!count || !(op->len || (op->flag & OPP_ASCIIZ))) - return 0; - if (*ppos >= 0xffffff || count >= 0xffffff) - return -EINVAL; - if (op->flag & OPP_STRINGLIST) { - for (k = 0, p = op->value; p < op->value + op->len; p++) - if (!*p) - k++; - i = op->len + 4 * k + 3; - } else if (op->flag & OPP_STRING) { - i = op->len + 3; - } else if (op->flag & OPP_BINARY) { - i = (op->len * 9) >> 2; - } else { - i = (op->len << 1) + 1; - } - k = *ppos; - if (k >= i) return 0; - if (count > i - k) count = i - k; - if (op->flag & OPP_STRING) { - if (!k) { - if (put_user('\'', buf)) - return -EFAULT; - k++; - count--; - } + int i; - if (k + count >= i - 2) - j = i - 2 - k; - else - j = count; - - if (j >= 0) { - if (copy_to_user(buf + k - *ppos, - op->value + k - 1, j)) - return -EFAULT; - count -= j; - k += j; - } + for (i = 0; i < len; i++) { + unsigned char val = p[i]; - if 
(count) { - if (put_user('\'', &buf [k++ - *ppos])) - return -EFAULT; - } - if (count > 1) { - if (put_user('\n', &buf [k++ - *ppos])) - return -EFAULT; - } - } else if (op->flag & OPP_STRINGLIST) { - char *tmp; - - tmp = kmalloc (i, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - - s = tmp; - *s++ = '\''; - for (p = op->value; p < op->value + op->len; p++) { - if (!*p) { - strcpy(s, "' + '"); - s += 5; - continue; - } - *s++ = *p; - } - strcpy(s, "'\n"); - - if (copy_to_user(buf, tmp + k, count)) - return -EFAULT; - - kfree(tmp); - k += count; - - } else if (op->flag & OPP_BINARY) { - char buffer[10]; - u32 *first, *last; - int first_off, last_cnt; - - first = ((u32 *)op->value) + k / 9; - first_off = k % 9; - last = ((u32 *)op->value) + (k + count - 1) / 9; - last_cnt = (k + count) % 9; - if (!last_cnt) last_cnt = 9; - - if (first == last) { - sprintf (buffer, "%08x.", *first); - if (copy_to_user(buf, buffer + first_off, - last_cnt - first_off)) - return -EFAULT; - buf += last_cnt - first_off; - } else { - for (q = first; q <= last; q++) { - sprintf (buffer, "%08x.", *q); - if (q == first) { - if (copy_to_user(buf, buffer + first_off, - 9 - first_off)) - return -EFAULT; - buf += 9 - first_off; - } else if (q == last) { - if (copy_to_user(buf, buffer, last_cnt)) - return -EFAULT; - buf += last_cnt; - } else { - if (copy_to_user(buf, buffer, 9)) - return -EFAULT; - buf += 9; - } - } - } + if ((i && !val) || + (val >= ' ' && val <= '~')) + continue; - if (last == (u32 *)(op->value + op->len - 4) && last_cnt == 9) { - if (put_user('\n', (buf - 1))) - return -EFAULT; - } + return 0; + } - k += count; + return 1; +} - } else if (op->flag & OPP_HEXSTRING) { - char buffer[3]; +static int property_show(struct seq_file *f, void *v) +{ + struct property *prop = f->private; + void *pval; + int len; - if ((k < i - 1) && (k & 1)) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (put_user(buffer[1], &buf[k++ - *ppos])) - return -EFAULT; - count--; - } + len = prop->length; + pval = prop->value; - for (; (count > 1) && (k < i - 1); k += 2) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (copy_to_user(buf + k - *ppos, buffer, 2)) - return -EFAULT; - count -= 2; - } + if (is_string(pval, len)) { + while (len > 0) { + int n = strlen(pval); - if (count && (k < i - 1)) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (put_user(buffer[0], &buf[k++ - *ppos])) - return -EFAULT; - count--; - } + seq_printf(f, "%s", (char *) pval); - if (count) { - if (put_user('\n', &buf [k++ - *ppos])) - return -EFAULT; - } - } - count = k - *ppos; - *ppos = k; - return count; -} + /* Skip over the NULL byte too. 
*/ + pval += n + 1; + len -= n + 1; -static ssize_t property_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - int i, j, k; - char *p; - u32 *q; - void *b; - openprom_property *op; - - if (*ppos >= 0xffffff || count >= 0xffffff) - return -EINVAL; - if (!filp->private_data) { - i = property_read (filp, NULL, 0, NULL); - if (i) - return i; - } - k = *ppos; - op = filp->private_data; - if (!(op->flag & OPP_STRING)) { - u32 *first, *last; - int first_off, last_cnt; - u32 mask, mask2; - char tmp [9]; - int forcelen = 0; - - j = k % 9; - for (i = 0; i < count; i++, j++) { - if (j == 9) j = 0; - if (!j) { - char ctmp; - if (get_user(ctmp, &buf[i])) - return -EFAULT; - if (ctmp != '.') { - if (ctmp != '\n') { - if (op->flag & OPP_BINARY) - return -EINVAL; - else - goto write_try_string; - } else { - count = i + 1; - forcelen = 1; - break; - } - } - } else { - char ctmp; - if (get_user(ctmp, &buf[i])) - return -EFAULT; - if (ctmp < '0' || - (ctmp > '9' && ctmp < 'A') || - (ctmp > 'F' && ctmp < 'a') || - ctmp > 'f') { - if (op->flag & OPP_BINARY) - return -EINVAL; - else - goto write_try_string; - } - } - } - op->flag |= OPP_BINARY; - tmp [8] = 0; - i = ((count + k + 8) / 9) << 2; - if (op->alloclen <= i) { - b = kmalloc (sizeof (openprom_property) + 2 * i, - GFP_KERNEL); - if (!b) - return -ENOMEM; - memcpy (b, filp->private_data, - sizeof (openprom_property) - + strlen (op->name) + op->alloclen); - memset (b + sizeof (openprom_property) - + strlen (op->name) + op->alloclen, - 0, 2 * i - op->alloclen); - op = b; - op->alloclen = 2*i; - b = filp->private_data; - filp->private_data = op; - kfree (b); + if (len > 0) + seq_printf(f, " + "); } - first = ((u32 *)op->value) + (k / 9); - first_off = k % 9; - last = (u32 *)(op->value + i); - last_cnt = (k + count) % 9; - if (first + 1 == last) { - memset (tmp, '0', 8); - if (copy_from_user(tmp + first_off, buf, - (count + first_off > 8) ? 
- 8 - first_off : count)) - return -EFAULT; - mask = 0xffffffff; - mask2 = 0xffffffff; - for (j = 0; j < first_off; j++) - mask >>= 1; - for (j = 8 - count - first_off; j > 0; j--) - mask2 <<= 1; - mask &= mask2; - if (mask) { - *first &= ~mask; - *first |= simple_strtoul (tmp, NULL, 16); - op->flag |= OPP_DIRTY; + } else { + if (len & 3) { + while (len) { + len--; + if (len) + seq_printf(f, "%02x.", + *(unsigned char *) pval); + else + seq_printf(f, "%02x", + *(unsigned char *) pval); + pval++; } } else { - op->flag |= OPP_DIRTY; - for (q = first; q < last; q++) { - if (q == first) { - if (first_off < 8) { - memset (tmp, '0', 8); - if (copy_from_user(tmp + first_off, - buf, - 8 - first_off)) - return -EFAULT; - mask = 0xffffffff; - for (j = 0; j < first_off; j++) - mask >>= 1; - *q &= ~mask; - *q |= simple_strtoul (tmp,NULL,16); - } - buf += 9; - } else if ((q == last - 1) && last_cnt - && (last_cnt < 8)) { - memset (tmp, '0', 8); - if (copy_from_user(tmp, buf, last_cnt)) - return -EFAULT; - mask = 0xffffffff; - for (j = 0; j < 8 - last_cnt; j++) - mask <<= 1; - *q &= ~mask; - *q |= simple_strtoul (tmp, NULL, 16); - buf += last_cnt; - } else { - char tchars[2 * sizeof(long) + 1]; - - if (copy_from_user(tchars, buf, sizeof(tchars) - 1)) - return -EFAULT; - tchars[sizeof(tchars) - 1] = '\0'; - *q = simple_strtoul (tchars, NULL, 16); - buf += 9; - } - } - } - if (!forcelen) { - if (op->len < i) - op->len = i; - } else - op->len = i; - *ppos += count; - } -write_try_string: - if (!(op->flag & OPP_BINARY)) { - if (!(op->flag & (OPP_QUOTED | OPP_NOTQUOTED))) { - char ctmp; - - /* No way, if somebody starts writing from the middle, - * we don't know whether he uses quotes around or not - */ - if (k > 0) - return -EINVAL; - if (get_user(ctmp, buf)) - return -EFAULT; - if (ctmp == '\'') { - op->flag |= OPP_QUOTED; - buf++; - count--; - (*ppos)++; - if (!count) { - op->flag |= OPP_STRING; - return 1; - } - } else - op->flag |= OPP_NOTQUOTED; - } - op->flag |= OPP_STRING; - if (op->alloclen <= count + *ppos) { - b = kmalloc (sizeof (openprom_property) - + 2 * (count + *ppos), GFP_KERNEL); - if (!b) - return -ENOMEM; - memcpy (b, filp->private_data, - sizeof (openprom_property) - + strlen (op->name) + op->alloclen); - memset (b + sizeof (openprom_property) - + strlen (op->name) + op->alloclen, - 0, 2*(count - *ppos) - op->alloclen); - op = b; - op->alloclen = 2*(count + *ppos); - b = filp->private_data; - filp->private_data = op; - kfree (b); - } - p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 
1 : 0); - if (copy_from_user(p, buf, count)) - return -EFAULT; - op->flag |= OPP_DIRTY; - for (i = 0; i < count; i++, p++) - if (*p == '\n') { - *p = 0; - break; + while (len >= 4) { + len -= 4; + + if (len) + seq_printf(f, "%08x.", + *(unsigned int *) pval); + else + seq_printf(f, "%08x", + *(unsigned int *) pval); + pval += 4; } - if (i < count) { - op->len = p - op->value; - *ppos += i + 1; - if ((p > op->value) && (op->flag & OPP_QUOTED) - && (*(p - 1) == '\'')) - op->len--; - } else { - if (p - op->value > op->len) - op->len = p - op->value; - *ppos += count; } } - return *ppos - k; + seq_printf(f, "\n"); + + return 0; } -int property_release (struct inode *inode, struct file *filp) +static void *property_start(struct seq_file *f, loff_t *pos) { - openprom_property *op = filp->private_data; - int error; - u32 node; - - if (!op) - return 0; - lock_kernel(); - node = nodes[ptr_nod(inode->u.generic_ip)].node; - if (ptr_nod(inode->u.generic_ip) == aliases) { - if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) { - char *p = op->name; - int i = (op->value - op->name) - strlen (op->name) - 1; - op->value [op->len] = 0; - *(op->value - 1) = ' '; - if (i) { - for (p = op->value - i - 2; p >= op->name; p--) - p[i] = *p; - p = op->name + i; - } - memcpy (p - 8, "nvalias ", 8); - prom_feval (p - 8); - } - } else if (op->flag & OPP_DIRTY) { - if (op->flag & OPP_STRING) { - op->value [op->len] = 0; - error = prom_setprop (node, op->name, - op->value, op->len + 1); - if (error <= 0) - printk (KERN_WARNING "openpromfs: " - "Couldn't write property %s\n", - op->name); - } else if ((op->flag & OPP_BINARY) || !op->len) { - error = prom_setprop (node, op->name, - op->value, op->len); - if (error <= 0) - printk (KERN_WARNING "openpromfs: " - "Couldn't write property %s\n", - op->name); - } else { - printk (KERN_WARNING "openpromfs: " - "Unknown property type of %s\n", - op->name); - } + if (*pos == 0) + return pos; + return NULL; +} + +static void *property_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static void property_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations property_op = { + .start = property_start, + .next = property_next, + .stop = property_stop, + .show = property_show +}; + +static int property_open(struct inode *inode, struct file *file) +{ + struct op_inode_info *oi = OP_I(inode); + int ret; + + BUG_ON(oi->type != op_inode_prop); + + ret = seq_open(file, &property_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = oi->u.prop; } - unlock_kernel(); - kfree (filp->private_data); - return 0; + return ret; } static const struct file_operations openpromfs_prop_ops = { - .read = property_read, - .write = property_write, - .release = property_release, + .open = property_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; -static const struct file_operations openpromfs_nodenum_ops = { - .read = nodenum_read, -}; +static int openpromfs_readdir(struct file *, void *, filldir_t); static const struct file_operations openprom_operations = { .read = generic_read_dir, .readdir = openpromfs_readdir, }; -static struct inode_operations openprom_alias_inode_operations = { - .create = openpromfs_create, - .lookup = openpromfs_lookup, - .unlink = openpromfs_unlink, -}; +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); static struct inode_operations openprom_inode_operations = { .lookup = openpromfs_lookup, }; -static int 
lookup_children(u16 n, const char * name, int len) +static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - int ret; - u16 node; - for (; n != 0xffff; n = nodes[n].next) { - node = nodes[n].child; - if (node != 0xffff) { - char buffer[128]; - int i; - char *p; - - while (node != 0xffff) { - if (prom_getname (nodes[node].node, - buffer, 128) >= 0) { - i = strlen (buffer); - if ((len == i) - && !strncmp (buffer, name, len)) - return NODE2INO(node); - p = strchr (buffer, '@'); - if (p && (len == p - buffer) - && !strncmp (buffer, name, len)) - return NODE2INO(node); - } - node = nodes[node].next; - } - } else - continue; - ret = lookup_children (nodes[n].child, name, len); - if (ret) return ret; - } - return 0; -} - -static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) -{ - int ino = 0; -#define OPFSL_DIR 0 -#define OPFSL_PROPERTY 1 -#define OPFSL_NODENUM 2 - int type = 0; - char buffer[128]; - char *p; + struct op_inode_info *ent_oi, *oi = OP_I(dir); + struct device_node *dp, *child; + struct property *prop; + enum op_inode_type ent_type; + union op_inode_data ent_data; const char *name; - u32 n; - u16 dirnode; - unsigned int len; - int i; struct inode *inode; - char buffer2[64]; + unsigned int ino; + int len; - inode = NULL; + BUG_ON(oi->type != op_inode_node); + + dp = oi->u.node; + name = dentry->d_name.name; len = dentry->d_name.len; - lock_kernel(); - if (name [0] == '.' && len == 5 && !strncmp (name + 1, "node", 4)) { - ino = NODEP2INO(NODE(dir->i_ino).first_prop); - type = OPFSL_NODENUM; - } - if (!ino) { - u16 node = NODE(dir->i_ino).child; - while (node != 0xffff) { - if (prom_getname (nodes[node].node, buffer, 128) >= 0) { - i = strlen (buffer); - if (len == i && !strncmp (buffer, name, len)) { - ino = NODE2INO(node); - type = OPFSL_DIR; - break; - } - p = strchr (buffer, '@'); - if (p && (len == p - buffer) - && !strncmp (buffer, name, len)) { - ino = NODE2INO(node); - type = OPFSL_DIR; - break; - } - } - node = nodes[node].next; - } - } - n = NODE(dir->i_ino).node; - dirnode = dir->i_ino - OPENPROM_FIRST_INO; - if (!ino) { - int j = NODEP2INO(NODE(dir->i_ino).first_prop); - if (dirnode != aliases) { - for (p = prom_firstprop (n, buffer2); - p && *p; - p = prom_nextprop (n, p, buffer2)) { - j++; - if ((len == strlen (p)) - && !strncmp (p, name, len)) { - ino = j; - type = OPFSL_PROPERTY; - break; - } - } - } else { - int k; - for (k = 0; k < aliases_nodes; k++) { - j++; - if (alias_names [k] - && (len == strlen (alias_names [k])) - && !strncmp (alias_names [k], name, len)) { - ino = j; - type = OPFSL_PROPERTY; - break; - } - } + + mutex_lock(&op_mutex); + + child = dp->child; + while (child) { + int n = strlen(child->path_component_name); + + if (len == n && + !strncmp(child->path_component_name, name, len)) { + ent_type = op_inode_node; + ent_data.node = child; + ino = child->unique_id; + goto found; } + child = child->sibling; } - if (!ino) { - ino = lookup_children (NODE(dir->i_ino).child, name, len); - if (ino) - type = OPFSL_DIR; - else { - unlock_kernel(); - return ERR_PTR(-ENOENT); + + prop = dp->properties; + while (prop) { + int n = strlen(prop->name); + + if (len == n && !strncmp(prop->name, name, len)) { + ent_type = op_inode_prop; + ent_data.prop = prop; + ino = prop->unique_id; + goto found; } + + prop = prop->next; } - inode = iget (dir->i_sb, ino); - unlock_kernel(); + + mutex_unlock(&op_mutex); + return ERR_PTR(-ENOENT); + +found: + inode = iget(dir->i_sb, ino); 
+ mutex_unlock(&op_mutex); if (!inode) return ERR_PTR(-EINVAL); - switch (type) { - case OPFSL_DIR: + ent_oi = OP_I(inode); + ent_oi->type = ent_type; + ent_oi->u = ent_data; + + switch (ent_type) { + case op_inode_node: inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - if (ino == OPENPROM_FIRST_INO + aliases) { - inode->i_mode |= S_IWUSR; - inode->i_op = &openprom_alias_inode_operations; - } else - inode->i_op = &openprom_inode_operations; + inode->i_op = &openprom_inode_operations; inode->i_fop = &openprom_operations; inode->i_nlink = 2; break; - case OPFSL_NODENUM: - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &openpromfs_nodenum_ops; - inode->i_nlink = 1; - inode->u.generic_ip = (void *)(long)(n); - break; - case OPFSL_PROPERTY: - if ((dirnode == options) && (len == 17) - && !strncmp (name, "security-password", 17)) + case op_inode_prop: + if (!strcmp(dp->name, "options") && (len == 17) && + !strncmp (name, "security-password", 17)) inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; - else { + else inode->i_mode = S_IFREG | S_IRUGO; - if (dirnode == options || dirnode == aliases) { - if (len != 4 || strncmp (name, "name", 4)) - inode->i_mode |= S_IWUSR; - } - } inode->i_fop = &openpromfs_prop_ops; inode->i_nlink = 1; - if (inode->i_size < 0) - inode->i_size = 0; - inode->u.generic_ip = (void *)(long)(((u16)dirnode) | - (((u16)(ino - NODEP2INO(NODE(dir->i_ino).first_prop) - 1)) << 16)); + inode->i_size = ent_oi->u.prop->length; break; } @@ -781,237 +263,89 @@ static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentr static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; + struct op_inode_info *oi = OP_I(inode); + struct device_node *dp = oi->u.node; + struct device_node *child; + struct property *prop; unsigned int ino; - u32 n; - int i, j; - char buffer[128]; - u16 node; - char *p; - char buffer2[64]; - - lock_kernel(); + int i; + + mutex_lock(&op_mutex); ino = inode->i_ino; i = filp->f_pos; switch (i) { case 0: - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; i++; filp->f_pos++; /* fall thru */ case 1: - if (filldir(dirent, "..", 2, i, - (NODE(ino).parent == 0xffff) ? - OPENPROM_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0) + if (filldir(dirent, "..", 2, i, + (dp->parent == NULL ? + OPENPROM_ROOT_INO : + dp->parent->unique_id), DT_DIR) < 0) goto out; i++; filp->f_pos++; /* fall thru */ default: i -= 2; - node = NODE(ino).child; - while (i && node != 0xffff) { - node = nodes[node].next; + + /* First, the children nodes as directories. */ + child = dp->child; + while (i && child) { + child = child->sibling; i--; } - while (node != 0xffff) { - if (prom_getname (nodes[node].node, buffer, 128) < 0) - goto out; - if (filldir(dirent, buffer, strlen(buffer), - filp->f_pos, NODE2INO(node), DT_DIR) < 0) + while (child) { + if (filldir(dirent, + child->path_component_name, + strlen(child->path_component_name), + filp->f_pos, child->unique_id, DT_DIR) < 0) goto out; + filp->f_pos++; - node = nodes[node].next; + child = child->sibling; } - j = NODEP2INO(NODE(ino).first_prop); - if (!i) { - if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0) + + /* Next, the properties as files. 
*/ + prop = dp->properties; + while (i && prop) { + prop = prop->next; + i--; + } + while (prop) { + if (filldir(dirent, prop->name, strlen(prop->name), + filp->f_pos, prop->unique_id, DT_REG) < 0) goto out; + filp->f_pos++; - } else - i--; - n = NODE(ino).node; - if (ino == OPENPROM_FIRST_INO + aliases) { - for (j++; i < aliases_nodes; i++, j++) { - if (alias_names [i]) { - if (filldir (dirent, alias_names [i], - strlen (alias_names [i]), - filp->f_pos, j, DT_REG) < 0) goto out; - filp->f_pos++; - } - } - } else { - for (p = prom_firstprop (n, buffer2); - p && *p; - p = prom_nextprop (n, p, buffer2)) { - j++; - if (i) i--; - else { - if (filldir(dirent, p, strlen(p), - filp->f_pos, j, DT_REG) < 0) - goto out; - filp->f_pos++; - } - } + prop = prop->next; } } out: - unlock_kernel(); - return 0; -} - -static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - char *p; - struct inode *inode; - - if (!dir) - return -ENOENT; - if (dentry->d_name.len > 256) - return -EINVAL; - p = kmalloc (dentry->d_name.len + 1, GFP_KERNEL); - if (!p) - return -ENOMEM; - strncpy (p, dentry->d_name.name, dentry->d_name.len); - p [dentry->d_name.len] = 0; - lock_kernel(); - if (aliases_nodes == ALIASES_NNODES) { - kfree(p); - unlock_kernel(); - return -EIO; - } - alias_names [aliases_nodes++] = p; - inode = iget (dir->i_sb, - NODEP2INO(NODE(dir->i_ino).first_prop) + aliases_nodes); - if (!inode) { - unlock_kernel(); - return -EINVAL; - } - inode->i_mode = S_IFREG | S_IRUGO | S_IWUSR; - inode->i_fop = &openpromfs_prop_ops; - inode->i_nlink = 1; - if (inode->i_size < 0) inode->i_size = 0; - inode->u.generic_ip = (void *)(long)(((u16)aliases) | - (((u16)(aliases_nodes - 1)) << 16)); - unlock_kernel(); - d_instantiate(dentry, inode); + mutex_unlock(&op_mutex); return 0; } -static int openpromfs_unlink (struct inode *dir, struct dentry *dentry) -{ - unsigned int len; - char *p; - const char *name; - int i; - - name = dentry->d_name.name; - len = dentry->d_name.len; - lock_kernel(); - for (i = 0; i < aliases_nodes; i++) - if ((strlen (alias_names [i]) == len) - && !strncmp (name, alias_names[i], len)) { - char buffer[512]; - - p = alias_names [i]; - alias_names [i] = NULL; - kfree (p); - strcpy (buffer, "nvunalias "); - memcpy (buffer + 10, name, len); - buffer [10 + len] = 0; - prom_feval (buffer); - } - unlock_kernel(); - return 0; -} +static kmem_cache_t *op_inode_cachep; -/* {{{ init section */ -static int __init check_space (u16 n) +static struct inode *openprom_alloc_inode(struct super_block *sb) { - unsigned long pages; + struct op_inode_info *oi; - if ((1 << alloced) * PAGE_SIZE < (n + 2) * sizeof(openpromfs_node)) { - pages = __get_free_pages (GFP_KERNEL, alloced + 1); - if (!pages) - return -1; + oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL); + if (!oi) + return NULL; - if (nodes) { - memcpy ((char *)pages, nodes, - (1 << alloced) * PAGE_SIZE); - free_pages ((unsigned long)nodes, alloced); - } - alloced++; - nodes = (openpromfs_node *)pages; - } - return 0; + return &oi->vfs_inode; } -static u16 __init get_nodes (u16 parent, u32 node) +static void openprom_destroy_inode(struct inode *inode) { - char *p; - u16 n = last_node++, i; - char buffer[64]; - - if (check_space (n) < 0) - return 0xffff; - nodes[n].parent = parent; - nodes[n].node = node; - nodes[n].next = 0xffff; - nodes[n].child = 0xffff; - nodes[n].first_prop = first_prop++; - if (!parent) { - char buffer[8]; - int j; - - if ((j = prom_getproperty (node, "name", buffer, 8)) >= 0) { - buffer[j] 
= 0; - if (!strcmp (buffer, "options")) - options = n; - else if (!strcmp (buffer, "aliases")) - aliases = n; - } - } - if (n != aliases) - for (p = prom_firstprop (node, buffer); - p && p != (char *)-1 && *p; - p = prom_nextprop (node, p, buffer)) - first_prop++; - else { - char *q; - for (p = prom_firstprop (node, buffer); - p && p != (char *)-1 && *p; - p = prom_nextprop (node, p, buffer)) { - if (aliases_nodes == ALIASES_NNODES) - break; - for (i = 0; i < aliases_nodes; i++) - if (!strcmp (p, alias_names [i])) - break; - if (i < aliases_nodes) - continue; - q = kmalloc (strlen (p) + 1, GFP_KERNEL); - if (!q) - return 0xffff; - strcpy (q, p); - alias_names [aliases_nodes++] = q; - } - first_prop += ALIASES_NNODES; - } - node = prom_getchild (node); - if (node) { - parent = get_nodes (n, node); - if (parent == 0xffff) - return 0xffff; - nodes[n].child = parent; - while ((node = prom_getsibling (node)) != 0) { - i = get_nodes (n, node); - if (i == 0xffff) - return 0xffff; - nodes[parent].next = i; - parent = i; - } - } - return n; + kmem_cache_free(op_inode_cachep, OP_I(inode)); } static void openprom_read_inode(struct inode * inode) @@ -1031,6 +365,8 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data) } static struct super_operations openprom_sops = { + .alloc_inode = openprom_alloc_inode, + .destroy_inode = openprom_destroy_inode, .read_inode = openprom_read_inode, .statfs = simple_statfs, .remount_fs = openprom_remount, @@ -1038,7 +374,8 @@ static struct super_operations openprom_sops = { static int openprom_fill_super(struct super_block *s, void *data, int silent) { - struct inode * root_inode; + struct inode *root_inode; + struct op_inode_info *oi; s->s_flags |= MS_NOATIME; s->s_blocksize = 1024; @@ -1049,6 +386,11 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) root_inode = iget(s, OPENPROM_ROOT_INO); if (!root_inode) goto out_no_root; + + oi = OP_I(root_inode); + oi->type = op_inode_node; + oi->u.node = of_find_node_by_path("/"); + s->s_root = d_alloc_root(root_inode); if (!s->s_root) goto out_no_root; @@ -1073,29 +415,39 @@ static struct file_system_type openprom_fs_type = { .kill_sb = kill_anon_super, }; +static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags) +{ + struct op_inode_info *oi = (struct op_inode_info *) data; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&oi->vfs_inode); +} + static int __init init_openprom_fs(void) { - nodes = (openpromfs_node *)__get_free_pages(GFP_KERNEL, 0); - if (!nodes) { - printk (KERN_WARNING "openpromfs: can't get free page\n"); - return -EIO; - } - if (get_nodes (0xffff, prom_root_node) == 0xffff) { - printk (KERN_WARNING "openpromfs: couldn't setup tree\n"); - return -EIO; - } - nodes[last_node].first_prop = first_prop; - return register_filesystem(&openprom_fs_type); + int err; + + op_inode_cachep = kmem_cache_create("op_inode_cache", + sizeof(struct op_inode_info), + 0, + (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + op_inode_init_once, NULL); + if (!op_inode_cachep) + return -ENOMEM; + + err = register_filesystem(&openprom_fs_type); + if (err) + kmem_cache_destroy(op_inode_cachep); + + return err; } static void __exit exit_openprom_fs(void) { - int i; unregister_filesystem(&openprom_fs_type); - free_pages ((unsigned long)nodes, alloced); - for (i = 0; i < aliases_nodes; i++) - kfree (alias_names [i]); - nodes = NULL; + kmem_cache_destroy(op_inode_cachep); } module_init(init_openprom_fs) 
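For context, and not part of the patch above: the new openpromfs property files emit exactly one record through seq_file, which is why property_start() only admits position 0. The same single-record behaviour could also be expressed with the kernel's single_open()/single_release() helpers; the sketch below assumes property_show() and OP_I() as defined in the patch, and the names property_open_single and openpromfs_prop_single_ops are hypothetical.

/* Sketch only, under the assumptions stated above.  single_open() stores
 * its third argument in seq_file->private and runs the show callback once,
 * matching the one-record property files in the patch. */
static int property_open_single(struct inode *inode, struct file *file)
{
	return single_open(file, property_show, OP_I(inode)->u.prop);
}

static const struct file_operations openpromfs_prop_single_ops = {
	.open		= property_open_single,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};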
diff --git a/fs/pnode.c b/fs/pnode.c index 37b568ed0e0..da42ee61c1d 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt) if (master) { list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) slave_mnt->mnt_master = master; - list_del(&mnt->mnt_slave); - list_add(&mnt->mnt_slave, &master->mnt_slave_list); + list_move(&mnt->mnt_slave, &master->mnt_slave_list); list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); INIT_LIST_HEAD(&mnt->mnt_slave_list); } else { @@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt) * umount the child only if the child has no * other children */ - if (child && list_empty(&child->mnt_mounts)) { - list_del(&child->mnt_hash); - list_add_tail(&child->mnt_hash, &mnt->mnt_hash); - } + if (child && list_empty(&child->mnt_mounts)) + list_move_tail(&child->mnt_hash, &mnt->mnt_hash); } } diff --git a/fs/proc/base.c b/fs/proc/base.c index 6afff725a8c..6ba7785319d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -74,6 +74,16 @@ #include <linux/poll.h> #include "internal.h" +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + /* * For hysterical raisins we keep the same inumbers as in the old procfs. * Feel free to change the macro below - just keep the range distinct from @@ -121,6 +131,8 @@ enum pid_directory_inos { PROC_TGID_ATTR_PREV, PROC_TGID_ATTR_EXEC, PROC_TGID_ATTR_FSCREATE, + PROC_TGID_ATTR_KEYCREATE, + PROC_TGID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TGID_LOGINUID, @@ -162,6 +174,8 @@ enum pid_directory_inos { PROC_TID_ATTR_PREV, PROC_TID_ATTR_EXEC, PROC_TID_ATTR_FSCREATE, + PROC_TID_ATTR_KEYCREATE, + PROC_TID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TID_LOGINUID, @@ -173,6 +187,9 @@ enum pid_directory_inos { PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ }; +/* Worst case buffer size needed for holding an integer. 
*/ +#define PROC_NUMBUF 10 + struct pid_entry { int type; int len; @@ -275,6 +292,8 @@ static struct pid_entry tgid_attr_stuff[] = { E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; static struct pid_entry tid_attr_stuff[] = { @@ -282,6 +301,8 @@ static struct pid_entry tid_attr_stuff[] = { E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; #endif @@ -290,12 +311,15 @@ static struct pid_entry tid_attr_stuff[] = { static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct task_struct *task = proc_task(inode); - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; - int fd = proc_type(inode) - PROC_TID_FD_DIR; + int fd = proc_fd(inode); - files = get_files_struct(task); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } if (files) { /* * We are not taking a ref to the file structure, so we must @@ -327,29 +351,33 @@ static struct fs_struct *get_fs_struct(struct task_struct *task) return fs; } -static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int get_nr_threads(struct task_struct *tsk) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); - int result = -ENOENT; - if (fs) { - read_lock(&fs->lock); - *mnt = mntget(fs->pwdmnt); - *dentry = dget(fs->pwd); - read_unlock(&fs->lock); - result = 0; - put_fs_struct(fs); + /* Must be called with the rcu_read_lock held */ + unsigned long flags; + int count = 0; + + if (lock_task_sighand(tsk, &flags)) { + count = atomic_read(&tsk->signal->count); + unlock_task_sighand(tsk, &flags); } - return result; + return count; } -static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; + + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); + } if (fs) { read_lock(&fs->lock); - *mnt = mntget(fs->rootmnt); - *dentry = dget(fs->root); + *mnt = mntget(fs->pwdmnt); + *dentry = dget(fs->pwd); read_unlock(&fs->lock); result = 0; put_fs_struct(fs); @@ -357,42 +385,16 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf return result; } - -/* Same as proc_root_link, but this addionally tries to get fs from other - * threads in the group */ -static int proc_task_root_link(struct inode *inode, struct dentry **dentry, - struct vfsmount **mnt) +static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs; + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; - struct task_struct *leader = proc_task(inode); - task_lock(leader); - fs = leader->fs; - if (fs) { - atomic_inc(&fs->count); - 
task_unlock(leader); - } else { - /* Try to get fs from other threads */ - task_unlock(leader); - read_lock(&tasklist_lock); - if (pid_alive(leader)) { - struct task_struct *task = leader; - - while ((task = next_thread(task)) != leader) { - task_lock(task); - fs = task->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(task); - break; - } - task_unlock(task); - } - } - read_unlock(&tasklist_lock); + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); } - if (fs) { read_lock(&fs->lock); *mnt = mntget(fs->rootmnt); @@ -404,7 +406,6 @@ static int proc_task_root_link(struct inode *inode, struct dentry **dentry, return result; } - #define MAY_PTRACE(task) \ (task == current || \ (task->parent == current && \ @@ -535,142 +536,22 @@ static int proc_oom_score(struct task_struct *task, char *buffer) /************************************************************************/ /* permission checks */ - -/* If the process being read is separated by chroot from the reading process, - * don't let the reader access the threads. - * - * note: this does dput(root) and mntput(vfsmnt) on exit. - */ -static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt) -{ - struct dentry *de, *base; - struct vfsmount *our_vfsmnt, *mnt; - int res = 0; - - read_lock(&current->fs->lock); - our_vfsmnt = mntget(current->fs->rootmnt); - base = dget(current->fs->root); - read_unlock(&current->fs->lock); - - spin_lock(&vfsmount_lock); - de = root; - mnt = vfsmnt; - - while (mnt != our_vfsmnt) { - if (mnt == mnt->mnt_parent) - goto out; - de = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - } - - if (!is_subdir(de, base)) - goto out; - spin_unlock(&vfsmount_lock); - -exit: - dput(base); - mntput(our_vfsmnt); - dput(root); - mntput(vfsmnt); - return res; -out: - spin_unlock(&vfsmount_lock); - res = -EACCES; - goto exit; -} - -static int proc_check_root(struct inode *inode) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... 
*/ - return -ENOENT; - return proc_check_chroot(root, vfsmnt); -} - -static int proc_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - return proc_check_root(inode); -} - -static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - - if (proc_task_root_link(inode, &root, &vfsmnt)) - return -ENOENT; - - return proc_check_chroot(root, vfsmnt); -} - -extern struct seq_operations proc_pid_maps_op; -static int maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_maps_operations = { - .open = maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_NUMA -extern struct seq_operations proc_pid_numa_maps_op; -static int numa_maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_numa_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_numa_maps_operations = { - .open = numa_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - -#ifdef CONFIG_MMU -extern struct seq_operations proc_pid_smaps_op; -static int smaps_open(struct inode *inode, struct file *file) +static int proc_fd_access_allowed(struct inode *inode) { - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_smaps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; + struct task_struct *task; + int allowed = 0; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. 
+ */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_attach(task); + put_task_struct(task); } - return ret; + return allowed; } -static struct file_operations proc_smaps_operations = { - .open = smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - extern struct seq_operations mounts_op; struct proc_mounts { struct seq_file m; @@ -679,16 +560,19 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); - struct namespace *namespace; + struct task_struct *task = get_proc_task(inode); + struct namespace *namespace = NULL; struct proc_mounts *p; int ret = -EINVAL; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) { ret = -ENOMEM; @@ -745,17 +629,21 @@ static struct file_operations proc_mounts_operations = { extern struct seq_operations mountstats_op; static int mountstats_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); int ret = seq_open(file, &mountstats_op); if (!ret) { struct seq_file *m = file->private_data; - struct namespace *namespace; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + struct namespace *namespace = NULL; + struct task_struct *task = get_proc_task(inode); + + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) m->private = namespace; @@ -782,18 +670,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PROC_BLOCK_SIZE) count = PROC_BLOCK_SIZE; + + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = PROC_I(inode)->op.proc_read(task, (char*)page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -810,12 +707,15 @@ static int mem_open(struct inode* inode, struct file* file) static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) goto out; @@ -865,6 +765,8 @@ out_put: out_free: free_page((unsigned long) page); out: + put_task_struct(task); +out_no_task: return ret; } @@ -877,15 +779,20 @@ static ssize_t mem_write(struct file * file, const char * buf, { int copied = 0; char *page; - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; + copied = -ESRCH; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) - return -ESRCH; + goto out; + copied = -ENOMEM; page = (char 
*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; while (count > 0) { int this_len, retval; @@ -908,6 +815,9 @@ static ssize_t mem_write(struct file * file, const char * buf, } *ppos = dst; free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return copied; } #endif @@ -938,13 +848,18 @@ static struct file_operations proc_mem_operations = { static ssize_t oom_adjust_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8]; + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust = task->oomkilladj; + int oom_adjust; loff_t __ppos = *ppos; - len = sprintf(buffer, "%i\n", oom_adjust); + if (!task) + return -ESRCH; + oom_adjust = task->oomkilladj; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); if (__ppos >= len) return 0; if (count > len-__ppos) @@ -958,15 +873,15 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8], *end; + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; int oom_adjust; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - memset(buffer, 0, 8); - if (count > 6) - count = 6; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; oom_adjust = simple_strtol(buffer, &end, 0); @@ -974,7 +889,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EINVAL; if (*end == '\n') end++; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; task->oomkilladj = oom_adjust; + put_task_struct(task); if (end - buffer == 0) return -EIO; return end - buffer; @@ -985,22 +904,21 @@ static struct file_operations proc_oom_adjust_operations = { .write = oom_adjust_write, }; -static struct inode_operations proc_mem_inode_operations = { - .permission = proc_permission, -}; - #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file->f_dentry->d_inode; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; + if (!task) + return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_loginuid(task->audit_context)); + put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1010,13 +928,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page, *tmp; ssize_t length; - struct task_struct *task = proc_task(inode); uid_t loginuid; if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; - if (current != task) + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) return -EPERM; if (count >= PAGE_SIZE) @@ -1040,7 +957,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(task, loginuid); + length = audit_set_loginuid(current, loginuid); if (likely(length == 0)) length = count; @@ -1059,13 +976,16 @@ static struct file_operations proc_loginuid_operations = { static ssize_t seccomp_read(struct 
file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20]; loff_t __ppos = *ppos; size_t len; + if (!tsk) + return -ESRCH; /* no need to print the trailing zero, so use only len */ len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + put_task_struct(tsk); if (__ppos >= len) return 0; if (count > len - __ppos) @@ -1079,29 +999,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf, static ssize_t seccomp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20], *end; unsigned int seccomp_mode; + ssize_t result; + + result = -ESRCH; + if (!tsk) + goto out_no_task; /* can set it only once to be even more secure */ + result = -EPERM; if (unlikely(tsk->seccomp.mode)) - return -EPERM; + goto out; + result = -EFAULT; memset(__buf, 0, sizeof(__buf)); count = min(count, sizeof(__buf) - 1); if (copy_from_user(__buf, buf, count)) - return -EFAULT; + goto out; + seccomp_mode = simple_strtoul(__buf, &end, 0); if (*end == '\n') end++; + result = -EINVAL; if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { tsk->seccomp.mode = seccomp_mode; set_tsk_thread_flag(tsk, TIF_SECCOMP); } else - return -EINVAL; + goto out; + result = -EIO; if (unlikely(!(end - __buf))) - return -EIO; - return end - __buf; + goto out; + result = end - __buf; +out: + put_task_struct(tsk); +out_no_task: + return result; } static struct file_operations proc_seccomp_operations = { @@ -1118,10 +1052,8 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) /* We don't need a base pointer in the /proc filesystem */ path_release(nd); - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt); @@ -1163,12 +1095,8 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b struct dentry *de; struct vfsmount *mnt = NULL; - lock_kernel(); - - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? 
*/ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt); @@ -1179,7 +1107,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b dput(de); mntput(mnt); out: - unlock_kernel(); return error; } @@ -1188,21 +1115,20 @@ static struct inode_operations proc_pid_link_inode_operations = { .follow_link = proc_pid_follow_link }; -#define NUMBUF 10 - static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; - struct task_struct *p = proc_task(inode); + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); unsigned int fd, tid, ino; int retval; - char buf[NUMBUF]; + char buf[PROC_NUMBUF]; struct files_struct * files; struct fdtable *fdt; retval = -ENOENT; - if (!pid_alive(p)) - goto out; + if (!p) + goto out_no_task; retval = 0; tid = p->pid; @@ -1213,7 +1139,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) goto out; filp->f_pos++; case 1: - ino = fake_ino(tid, PROC_TID_INO); + ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) goto out; filp->f_pos++; @@ -1232,7 +1158,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) continue; rcu_read_unlock(); - j = NUMBUF; + j = PROC_NUMBUF; i = fd; do { j--; @@ -1241,7 +1167,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) } while (i); ino = fake_ino(tid, PROC_TID_FD_DIR + fd); - if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) { + if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) { rcu_read_lock(); break; } @@ -1251,6 +1177,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) put_files_struct(files); } out: + put_task_struct(p); +out_no_task: return retval; } @@ -1262,16 +1190,18 @@ static int proc_pident_readdir(struct file *filp, int pid; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); struct pid_entry *p; ino_t ino; int ret; ret = -ENOENT; - if (!pid_alive(proc_task(inode))) + if (!task) goto out; ret = 0; - pid = proc_task(inode)->pid; + pid = task->pid; + put_task_struct(task); i = filp->f_pos; switch (i) { case 0: @@ -1354,22 +1284,19 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); - ei->task = NULL; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); - if (!pid_alive(task)) - goto out_unlock; - /* * grab the reference to task. */ - get_task_struct(task); - ei->task = task; - ei->type = ino; + ei->pid = get_pid(task->pids[PIDTYPE_PID].pid); + if (!ei->pid) + goto out_unlock; + inode->i_uid = 0; inode->i_gid = 0; - if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) { + if (task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } @@ -1379,7 +1306,6 @@ out: return inode; out_unlock: - ei->pde = NULL; iput(inode); return NULL; } @@ -1393,13 +1319,21 @@ out_unlock: * * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. 
To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - if (pid_alive(task)) { - if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) { + struct task_struct *task = get_proc_task(inode); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } else { @@ -1407,59 +1341,75 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = 0; } security_task_to_inode(task, inode); + put_task_struct(task); return 1; } d_drop(dentry); return 0; } +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + stat->uid = task->euid; + stat->gid = task->egid; + } + } + rcu_read_unlock(); + return 0; +} + static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - int fd = proc_type(inode) - PROC_TID_FD_DIR; + struct task_struct *task = get_proc_task(inode); + int fd = proc_fd(inode); struct files_struct *files; - files = get_files_struct(task); - if (files) { - rcu_read_lock(); - if (fcheck_files(files, fd)) { + if (task) { + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + if (fcheck_files(files, fd)) { + rcu_read_unlock(); + put_files_struct(files); + if (task_dumpable(task)) { + inode->i_uid = task->euid; + inode->i_gid = task->egid; + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } rcu_read_unlock(); put_files_struct(files); - if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; - } else { - inode->i_uid = 0; - inode->i_gid = 0; - } - security_task_to_inode(task, inode); - return 1; } - rcu_read_unlock(); - put_files_struct(files); + put_task_struct(task); } d_drop(dentry); return 0; } -static void pid_base_iput(struct dentry *dentry, struct inode *inode) -{ - struct task_struct *task = proc_task(inode); - spin_lock(&task->proc_lock); - if (task->proc_dentry == dentry) - task->proc_dentry = NULL; - spin_unlock(&task->proc_lock); - iput(inode); -} - static int pid_delete_dentry(struct dentry * dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. 
*/ - return !pid_alive(proc_task(dentry->d_inode)); + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } static struct dentry_operations tid_fd_dentry_operations = @@ -1474,13 +1424,6 @@ static struct dentry_operations pid_dentry_operations = .d_delete = pid_delete_dentry, }; -static struct dentry_operations pid_base_dentry_operations = -{ - .d_revalidate = pid_revalidate, - .d_iput = pid_base_iput, - .d_delete = pid_delete_dentry, -}; - /* Lookups */ static unsigned name_to_int(struct dentry *dentry) @@ -1508,22 +1451,24 @@ out: /* SMP-safe */ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { - struct task_struct *task = proc_task(dir); + struct task_struct *task = get_proc_task(dir); unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); struct file * file; struct files_struct * files; struct inode *inode; struct proc_inode *ei; + if (!task) + goto out_no_task; if (fd == ~0U) goto out; - if (!pid_alive(task)) - goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); if (!inode) goto out; ei = PROC_I(inode); + ei->fd = fd; files = get_files_struct(task); if (!files) goto out_unlock; @@ -1548,19 +1493,25 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, ei->op.proc_get_link = proc_fd_link; dentry->d_op = &tid_fd_dentry_operations; d_add(dentry, inode); - return NULL; + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + result = NULL; +out: + put_task_struct(task); +out_no_task: + return result; out_unlock2: spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); -out: - return ERR_PTR(-ENOENT); + goto out; } static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd); +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); static struct file_operations proc_fd_operations = { .read = generic_read_dir, @@ -1577,12 +1528,11 @@ static struct file_operations proc_task_operations = { */ static struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, - .permission = proc_permission, }; static struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, - .permission = proc_task_permission, + .getattr = proc_task_getattr, }; #ifdef CONFIG_SECURITY @@ -1592,12 +1542,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = security_getprocattr(task, (char*)file->f_dentry->d_name.name, @@ -1605,6 +1560,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1614,26 +1572,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page; ssize_t length; - struct task_struct *task = proc_task(inode); + 
struct task_struct *task = get_proc_task(inode); + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; - if (*ppos != 0) { - /* No partial writes. */ - return -EINVAL; - } + + /* No partial writes. */ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; page = (char*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; + length = -EFAULT; if (copy_from_user(page, buf, count)) - goto out; + goto out_free; length = security_setprocattr(task, (char*)file->f_dentry->d_name.name, (void*)page, count); -out: +out_free: free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1648,24 +1616,22 @@ static struct file_operations proc_tgid_attr_operations; static struct inode_operations proc_tgid_attr_inode_operations; #endif -static int get_tid_list(int index, unsigned int *tids, struct inode *dir); - /* SMP-safe */ static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, struct pid_entry *ents) { struct inode *inode; - int error; - struct task_struct *task = proc_task(dir); + struct dentry *error; + struct task_struct *task = get_proc_task(dir); struct pid_entry *p; struct proc_inode *ei; - error = -ENOENT; + error = ERR_PTR(-ENOENT); inode = NULL; - if (!pid_alive(task)) - goto out; + if (!task) + goto out_no_task; for (p = ents; p->name; p++) { if (p->len != dentry->d_name.len) @@ -1676,7 +1642,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, if (!p->name) goto out; - error = -EINVAL; + error = ERR_PTR(-EINVAL); inode = proc_pid_make_inode(dir->i_sb, task, p->type); if (!inode) goto out; @@ -1689,7 +1655,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, */ switch(p->type) { case PROC_TGID_TASK: - inode->i_nlink = 2 + get_tid_list(2, NULL, dir); + inode->i_nlink = 2; inode->i_op = &proc_task_inode_operations; inode->i_fop = &proc_task_operations; break; @@ -1759,7 +1725,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir, #endif case PROC_TID_MEM: case PROC_TGID_MEM: - inode->i_op = &proc_mem_inode_operations; inode->i_fop = &proc_mem_operations; break; #ifdef CONFIG_SECCOMP @@ -1801,6 +1766,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir, case PROC_TGID_ATTR_EXEC: case PROC_TID_ATTR_FSCREATE: case PROC_TGID_ATTR_FSCREATE: + case PROC_TID_ATTR_KEYCREATE: + case PROC_TGID_ATTR_KEYCREATE: + case PROC_TID_ATTR_SOCKCREATE: + case PROC_TGID_ATTR_SOCKCREATE: inode->i_fop = &proc_pid_attr_operations; break; #endif @@ -1842,14 +1811,18 @@ static struct dentry *proc_pident_lookup(struct inode *dir, default: printk("procfs: impossible type (%d)",p->type); iput(inode); - return ERR_PTR(-EINVAL); + error = ERR_PTR(-EINVAL); + goto out; } dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); - return NULL; - + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; out: - return ERR_PTR(error); + put_task_struct(task); +out_no_task: + return error; } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ @@ -1872,10 +1845,12 @@ static struct file_operations proc_tid_base_operations = { static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, }; #ifdef CONFIG_SECURITY @@ -1917,10 +1892,12 @@ static 
struct dentry *proc_tid_attr_lookup(struct inode *dir, static struct inode_operations proc_tgid_attr_inode_operations = { .lookup = proc_tgid_attr_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, + .getattr = pid_getattr, }; #endif @@ -1930,14 +1907,14 @@ static struct inode_operations proc_tid_attr_inode_operations = { static int proc_self_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return vfs_readlink(dentry,buffer,buflen,tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return ERR_PTR(vfs_follow_link(nd,tmp)); } @@ -1948,67 +1925,80 @@ static struct inode_operations proc_self_inode_operations = { }; /** - * proc_pid_unhash - Unhash /proc/@pid entry from the dcache. - * @p: task that should be flushed. + * proc_flush_task - Remove dcache entries for @task from the /proc dcache. + * + * @task: task that should be flushed. + * + * Looks in the dcache for + * /proc/@pid + * /proc/@tgid/task/@pid + * if either directory is present, flushes it and all of its children + * from the dcache. * - * Drops the /proc/@pid dcache entry from the hash chains. + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is provided to flush those useless + * dcache entries at process exit time. * - * Dropping /proc/@pid entries and detach_pid must be synchroneous, - * otherwise e.g. /proc/@pid/exe might point to the wrong executable, - * if the pid value is immediately reused. This is enforced by - * - caller must acquire spin_lock(p->proc_lock) - * - must be called before detach_pid() - * - proc_pid_lookup acquires proc_lock, and checks that - * the target is not dead by looking at the attach count - * of PIDTYPE_PID. + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist at process exit time; it + * just makes it very unlikely that any will persist. */ - -struct dentry *proc_pid_unhash(struct task_struct *p) +void proc_flush_task(struct task_struct *task) { - struct dentry *proc_dentry; + struct dentry *dentry, *leader, *dir; + char buf[PROC_NUMBUF]; + struct qstr name; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } - proc_dentry = p->proc_dentry; - if (proc_dentry != NULL) { + if (thread_group_leader(task)) + goto out; - spin_lock(&dcache_lock); - spin_lock(&proc_dentry->d_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - spin_unlock(&proc_dentry->d_lock); - } else { - spin_unlock(&proc_dentry->d_lock); - proc_dentry = NULL; - } - spin_unlock(&dcache_lock); - } - return proc_dentry; -} + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->tgid); + leader = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (!leader) + goto out; -/** - * proc_pid_flush - recover memory used by stale /proc/@pid/x entries - * @proc_dentry: directoy to prune. - * - * Shrink the /proc directory that was used by the just killed thread.
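proc_flush_task() above repeats one small step per name: hash the pid string under a parent dentry with d_hash_and_lookup(), and if the entry is cached, shrink its children and unhash it. A rough sketch of that step, with a made-up helper name:

    static void prune_one_proc_dentry(struct dentry *parent, const char *buf, int len)
    {
            struct qstr name;
            struct dentry *dentry;

            name.name = buf;
            name.len = len;
            dentry = d_hash_and_lookup(parent, &name);
            if (dentry) {
                    shrink_dcache_parent(dentry);   /* flush the cached children first */
                    d_drop(dentry);                 /* then unhash the directory itself */
                    dput(dentry);
            }
    }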
- */ - -void proc_pid_flush(struct dentry *proc_dentry) -{ - might_sleep(); - if(proc_dentry != NULL) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); + name.name = "task"; + name.len = strlen(name.name); + dir = d_hash_and_lookup(leader, &name); + if (!dir) + goto out_put_leader; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(dir, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); } + + dput(dir); +out_put_leader: + dput(leader); +out: + return; } /* SMP-safe */ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; struct inode *inode; struct proc_inode *ei; unsigned tgid; - int died; if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) { inode = new_inode(dir->i_sb); @@ -2029,21 +2019,18 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct if (tgid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tgid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO); + if (!inode) + goto out_put_task; - - if (!inode) { - put_task_struct(task); - goto out; - } inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; @@ -2054,45 +2041,40 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct inode->i_nlink = 4; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; - died = 0; d_add(dentry, inode); - spin_lock(&task->proc_lock); - task->proc_dentry = dentry; - if (!pid_alive(task)) { - dentry = proc_pid_unhash(task); - died = 1; - } - spin_unlock(&task->proc_lock); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; +out_put_task: put_task_struct(task); - if (died) { - proc_pid_flush(dentry); - goto out; - } - return NULL; out: - return ERR_PTR(-ENOENT); + return result; } /* SMP-safe */ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; - struct task_struct *leader = proc_task(dir); + struct task_struct *leader = get_proc_task(dir); struct inode *inode; unsigned tid; + if (!leader) + goto out_no_task; + tid = name_to_int(dentry); if (tid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; if (leader->tgid != task->tgid) @@ -2113,101 +2095,95 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry inode->i_nlink = 3; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; - put_task_struct(task); - return NULL; out_drop_task: put_task_struct(task); out: - return ERR_PTR(-ENOENT); + put_task_struct(leader); +out_no_task: + return result; } -#define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - /* - * Get a few tgid's to return for filldir - we need to hold the - * tasklist lock while doing this, 
and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the first tgid to return to user space. + * + * Usually this is just whatever follows &init_task, but if the users + * buffer was too small to hold the full list or there was a seek into + * the middle of the directory we have more work to do. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with &init_task and walk nr + * threads past it. */ -static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) -{ - struct task_struct *p; - int nr_tgids = 0; - - index--; - read_lock(&tasklist_lock); - p = NULL; - if (version) { - p = find_task_by_pid(version); - if (p && !thread_group_leader(p)) - p = NULL; +static struct task_struct *first_tgid(int tgid, unsigned int nr) +{ + struct task_struct *pos; + rcu_read_lock(); + if (tgid && nr) { + pos = find_task_by_pid(tgid); + if (pos && thread_group_leader(pos)) + goto found; } + /* If nr exceeds the number of processes get out quickly */ + pos = NULL; + if (nr && nr >= nr_processes()) + goto done; - if (p) - index = 0; - else - p = next_task(&init_task); - - for ( ; p != &init_task; p = next_task(p)) { - int tgid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - tgids[nr_tgids] = tgid; - nr_tgids++; - if (nr_tgids >= PROC_MAXPIDS) - break; + /* If we haven't found our starting place yet start with + * the init_task and walk nr tasks forward. + */ + for (pos = next_task(&init_task); nr > 0; --nr) { + pos = next_task(pos); + if (pos == &init_task) { + pos = NULL; + goto done; + } } - read_unlock(&tasklist_lock); - return nr_tgids; +found: + get_task_struct(pos); +done: + rcu_read_unlock(); + return pos; } /* - * Get a few tid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the next task in the task list. + * Return NULL if we loop or there is any error. + * + * The reference to the input task_struct is released. */ -static int get_tid_list(int index, unsigned int *tids, struct inode *dir) -{ - struct task_struct *leader_task = proc_task(dir); - struct task_struct *task = leader_task; - int nr_tids = 0; - - index -= 2; - read_lock(&tasklist_lock); - /* - * The starting point task (leader_task) might be an already - * unlinked task, which cannot be used to access the task-list - * via next_thread(). 
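first_tgid() and next_tgid() form a simple hand-off: each call to next_tgid() drops the reference it was given and returns the following process with a fresh reference, or NULL once the list wraps back to init_task. A rough usage sketch, assuming a hypothetical debug walker in the same file (both helpers are static):

    static void walk_all_tgids(void)
    {
            struct task_struct *task;

            /* 0, 0: no cached restart tgid, no seek offset */
            for (task = first_tgid(0, 0); task; task = next_tgid(task))
                    printk(KERN_DEBUG "tgid %d\n", task->pid);
            /* nothing to put here: next_tgid() released every reference,
             * including the one on the last task when it returned NULL */
    }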
- */ - if (pid_alive(task)) do { - int tid = task->pid; - - if (--index >= 0) - continue; - if (tids != NULL) - tids[nr_tids] = tid; - nr_tids++; - if (nr_tids >= PROC_MAXPIDS) - break; - } while ((task = next_thread(task)) != leader_task); - read_unlock(&tasklist_lock); - return nr_tids; +static struct task_struct *next_tgid(struct task_struct *start) +{ + struct task_struct *pos; + rcu_read_lock(); + pos = start; + if (pid_alive(start)) + pos = next_task(start); + if (pid_alive(pos) && (pos != &init_task)) { + get_task_struct(pos); + goto done; + } + pos = NULL; +done: + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tgid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_tgids, i; - int next_tgid; + struct task_struct *task; + int tgid; if (!nr) { ino_t ino = fake_ino(0,PROC_TGID_INO); @@ -2216,63 +2192,116 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) filp->f_pos++; nr++; } + nr -= 1; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - next_tgid = filp->f_version; + tgid = filp->f_version; filp->f_version = 0; - for (;;) { - nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); - if (!nr_tgids) { - /* no more entries ! */ + for (task = first_tgid(tgid, nr); + task; + task = next_tgid(task), filp->f_pos++) { + int len; + ino_t ino; + tgid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tgid); + ino = fake_ino(tgid, PROC_TGID_INO); + if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) { + /* returning this tgid failed, save it as the first + * pid for the next readdir call */ + filp->f_version = tgid; + put_task_struct(task); break; } - next_tgid = 0; + } + return 0; +} - /* do not use the last found pid, reserve it for next_tgid */ - if (nr_tgids == PROC_MAXPIDS) { - nr_tgids--; - next_tgid = tgid_array[nr_tgids]; - } +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the user's + * buffer was too small or there was a seek into the middle of the + * directory we have more work to do. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. + */ +static struct task_struct *first_tid(struct task_struct *leader, + int tid, int nr) +{ + struct task_struct *pos; - for (i=0;i<nr_tgids;i++) { - int tgid = tgid_array[i]; - ino_t ino = fake_ino(tgid,PROC_TGID_INO); - unsigned long j = PROC_NUMBUF; + rcu_read_lock(); + /* Attempt to start with the pid of a thread */ + if (tid && (nr > 0)) { + pos = find_task_by_pid(tid); + if (pos && (pos->group_leader == leader)) + goto found; } - do - buf[--j] = '0' + (tgid % 10); - while ((tgid /= 10) != 0); + /* If nr exceeds the number of threads there is nothing to do */ + pos = NULL; + if (nr && nr >= get_nr_threads(leader)) + goto out; - if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) { - /* returning this tgid failed, save it as the first - * pid for the next readir call */ - filp->f_version = tgid_array[i]; - goto out; - } - filp->f_pos++; - nr++; + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward.
+ */ + for (pos = leader; nr > 0; --nr) { + pos = next_thread(pos); + if (pos == leader) { + pos = NULL; + goto out; } } +found: + get_task_struct(pos); out: - return 0; + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; - unsigned int nr_tids, i; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *leader = get_proc_task(inode); + struct task_struct *task; int retval = -ENOENT; ino_t ino; + int tid; unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ - if (!pid_alive(proc_task(inode))) - goto out; + if (!leader) + goto out_no_task; retval = 0; switch (pos) { @@ -2290,24 +2319,45 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi /* fall through */ } - nr_tids = get_tid_list(pos, tid_array, inode); - inode->i_nlink = pos + nr_tids; - - for (i = 0; i < nr_tids; i++) { - unsigned long j = PROC_NUMBUF; - int tid = tid_array[i]; - - ino = fake_ino(tid,PROC_TID_INO); - - do - buf[--j] = '0' + (tid % 10); - while ((tid /= 10) != 0); - - if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0) + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. 
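first_tid() and next_tid() follow the same reference hand-off for threads, so the only case where the caller must drop a reference itself is when it abandons the walk early; that is exactly what the filldir failure path below does with put_task_struct(). An illustrative early-exit walker, assuming the caller already holds a reference on the leader (helper name and stop condition are invented for this sketch):

    static void walk_tids_until(struct task_struct *leader, int cutoff)
    {
            struct task_struct *task;

            for (task = first_tid(leader, 0, 0); task; task = next_tid(task)) {
                    if (task->pid > cutoff) {
                            /* breaking out early: we still own this reference */
                            put_task_struct(task);
                            break;
                    }
            }
    }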
+ */ + tid = filp->f_version; + filp->f_version = 0; + for (task = first_tid(leader, tid, pos - 2); + task; + task = next_tid(task), pos++) { + int len; + tid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tid); + ino = fake_ino(tid, PROC_TID_INO); + if (filldir(dirent, buf, len, pos, ino, DT_DIR) < 0) { + /* returning this tid failed, save it as the first + * pid for the next readdir call */ + filp->f_version = tid; + put_task_struct(task); break; - pos++; + } } out: filp->f_pos = pos; + put_task_struct(leader); +out_no_task: return retval; } + +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + generic_fillattr(inode, stat); + + if (p) { + rcu_read_lock(); + stat->nlink += get_nr_threads(p); + rcu_read_unlock(); + put_task_struct(p); + } + + return 0; +} diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 722b9c46311..6dcef089e18 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de) static void proc_delete_inode(struct inode *inode) { struct proc_dir_entry *de; - struct task_struct *tsk; truncate_inode_pages(&inode->i_data, 0); - /* Let go of any associated process */ - tsk = PROC_I(inode)->task; - if (tsk) - put_task_struct(tsk); + /* Stop tracking associated processes */ + put_pid(PROC_I(inode)->pid); /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; @@ -94,8 +91,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; - ei->task = NULL; - ei->type = 0; + ei->pid = NULL; + ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; inode = &ei->vfs_inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0502f17b860..146a434ba94 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -37,16 +37,26 @@ extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern struct file_operations proc_maps_operations; +extern struct file_operations proc_numa_maps_operations; +extern struct file_operations proc_smaps_operations; + + void free_proc_entry(struct proc_dir_entry *de); int proc_init_inodecache(void); -static inline struct task_struct *proc_task(struct inode *inode) +static inline struct pid *proc_pid(struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) { - return PROC_I(inode)->task; + return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int proc_type(struct inode *inode) +static inline int proc_fd(struct inode *inode) { - return PROC_I(inode)->type; + return PROC_I(inode)->fd; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 91b7c15ab37..0137ec4c136 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount * { struct vm_area_struct * vma; int result = -ENOENT; - struct task_struct *task = proc_task(inode); - struct mm_struct * mm = get_task_mm(task); + struct task_struct *task = get_proc_task(inode); + struct mm_struct * mm = NULL; + if (task) { + mm =
get_task_mm(task); + put_task_struct(task); + } if (!mm) goto out; down_read(&mm->mmap_sem); @@ -120,7 +124,8 @@ struct mem_size_stats static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; struct vm_area_struct *vma = v; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; @@ -295,12 +300,16 @@ static int show_smap(struct seq_file *m, void *v) static void *m_start(struct seq_file *m, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; unsigned long last_addr = m->version; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma; + struct vm_area_struct *vma, *tail_vma = NULL; loff_t l = *pos; + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + /* * We remember last_addr rather than next_addr to hit with * mmap_cache most of the time. We have zero last_addr at @@ -311,11 +320,15 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (last_addr == -1UL) return NULL; - mm = get_task_mm(task); + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return NULL; + + mm = get_task_mm(priv->task); if (!mm) return NULL; - tail_vma = get_gate_vma(task); + priv->tail_vma = tail_vma = get_gate_vma(priv->task); down_read(&mm->mmap_sem); /* Start with last addr hint */ @@ -350,11 +363,9 @@ out: return tail_vma; } -static void m_stop(struct seq_file *m, void *v) +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - if (vma && vma != get_gate_vma(task)) { + if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; up_read(&mm->mmap_sem); mmput(mm); @@ -363,38 +374,103 @@ static void m_stop(struct seq_file *m, void *v) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = get_gate_vma(task); + struct vm_area_struct *tail_vma = priv->tail_vma; (*pos)++; if (vma && (vma != tail_vma) && vma->vm_next) return vma->vm_next; - m_stop(m, v); + vma_stop(priv, vma); return (vma != tail_vma)? 
tail_vma: NULL; } -struct seq_operations proc_pid_maps_op = { +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; -struct seq_operations proc_pid_smaps_op = { +static struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_smap }; +static int do_maps_open(struct inode *inode, struct file *file, + struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static int maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -struct seq_operations proc_pid_numa_maps_op = { +static struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_numa_map }; + +static int numa_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +struct file_operations proc_numa_maps_operations = { + .open = numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; #endif + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 8f68827ed10..af69f28277b 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos) { return NULL; } -struct seq_operations proc_pid_maps_op = { +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; + +static int maps_open(struct inode *inode, struct file *file) +{ + int ret; + ret = seq_open(file, &proc_pid_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = NULL; + } + return ret; +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index cf6e1cf4035..752cea12e30 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t return res; } -static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf, - size_t count, loff_t pos) -{ - return generic_file_aio_write(iocb, buf, count, pos); -} - const struct file_operations reiserfs_file_operations = { .read = generic_file_read, .write = reiserfs_file_write, @@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = { .fsync = reiserfs_sync_file, .sendfile = 
generic_file_sendfile, .aio_read = generic_file_aio_read, - .aio_write = reiserfs_aio_write, + .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 1b73529b809..49d1a53dbef 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -834,8 +834,7 @@ static int write_ordered_buffers(spinlock_t * lock, get_bh(bh); if (test_set_buffer_locked(bh)) { if (!buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); goto loop_next; } spin_unlock(lock); @@ -855,8 +854,7 @@ static int write_ordered_buffers(spinlock_t * lock, ret = -EIO; } if (buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); add_to_chunk(&chunk, bh, lock, write_ordered_chunk); } else { reiserfs_free_jh(bh); diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index c71dd2760d3..c8e96195b96 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req) if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) goto out; - list_del_init(&req->rq_queue); - list_add_tail(&req->rq_queue, &server->recvq); + list_move_tail(&req->rq_queue, &server->recvq); result = 1; out: return result; @@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server) result = smb_request_send_req(req); if (result < 0) { server->conn_error = result; - list_del_init(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); result = -EIO; goto out; } diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c index 3f71384020c..24577e2c489 100644 --- a/fs/smbfs/smbiod.c +++ b/fs/smbfs/smbiod.c @@ -193,8 +193,7 @@ int smbiod_retry(struct smb_sb_info *server) if (req->rq_flags & SMB_REQ_RETRY) { /* must move the request to the xmitq */ VERBOSE("retrying request %p on recvq\n", req); - list_del(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); continue; } #endif diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 610b5bdbe75..61c42430cba 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); - } + if (filp->f_pos == 2) + list_move(q, &parent_sd->s_children); + for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct sysfs_dirent *next; const char * name; @@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } |
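The reiserfs, smbfs and sysfs hunks above are mechanical conversions of an open-coded unlink-and-relink pair to list_move()/list_move_tail(). The helpers are behaviourally equivalent to the removed two-line sequences (a list_del_init() followed by list_add() collapses the same way, since the re-add overwrites the re-initialised pointers); roughly:

    /* What the list_move() conversions replace, shown open-coded. */
    static inline void open_coded_list_move(struct list_head *entry, struct list_head *head)
    {
            list_del(entry);                /* unlink from wherever it currently is */
            list_add(entry, head);          /* relink just after head */
    }

    static inline void open_coded_list_move_tail(struct list_head *entry, struct list_head *head)
    {
            list_del(entry);
            list_add_tail(entry, head);     /* relink just before head */
    }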