27 files changed, 489 insertions, 264 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 23bb0ceabe3..4b42c2bb603 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>	/* generic_writepages */
+#include <linux/slab.h>
 #include <linux/pagevec.h>
 #include <linux/task_io_accounting_ops.h>
 
@@ -336,16 +337,15 @@ out:
 /*
  * Get ref for the oldest snapc for an inode with dirty data... that is, the
  * only snap context we are allowed to write back.
- *
- * Caller holds i_lock.
  */
-static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
-						      u64 *snap_size)
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+						    u64 *snap_size)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_snap_context *snapc = NULL;
 	struct ceph_cap_snap *capsnap = NULL;
 
+	spin_lock(&inode->i_lock);
 	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
 		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
 		     capsnap->context, capsnap->dirty_pages);
@@ -356,21 +356,11 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
 			break;
 		}
 	}
-	if (!snapc && ci->i_snap_realm) {
-		snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+	if (!snapc && ci->i_head_snapc) {
+		snapc = ceph_get_snap_context(ci->i_head_snapc);
 		dout(" head snapc %p has %d dirty pages\n",
 		     snapc, ci->i_wrbuffer_ref_head);
 	}
-	return snapc;
-}
-
-static struct ceph_snap_context *get_oldest_context(struct inode *inode,
-						    u64 *snap_size)
-{
-	struct ceph_snap_context *snapc = NULL;
-
-	spin_lock(&inode->i_lock);
-	snapc = __get_oldest_context(inode, snap_size);
 	spin_unlock(&inode->i_lock);
 	return snapc;
 }
@@ -391,7 +381,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	int len = PAGE_CACHE_SIZE;
 	loff_t i_size;
 	int err = 0;
-	struct ceph_snap_context *snapc;
+	struct ceph_snap_context *snapc, *oldest;
 	u64 snap_size = 0;
 	long writeback_stat;
 
@@ -412,13 +402,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		dout("writepage %p page %p not dirty?\n", inode, page);
 		goto out;
 	}
-	if (snapc != get_oldest_context(inode, &snap_size)) {
+	oldest = get_oldest_context(inode, &snap_size);
+	if (snapc->seq > oldest->seq) {
 		dout("writepage %p page %p snapc %p not writeable - noop\n",
 		     inode, page, (void *)page->private);
 		/* we should only noop if called by kswapd */
 		WARN_ON((current->flags & PF_MEMALLOC) == 0);
+		ceph_put_snap_context(oldest);
 		goto out;
 	}
+	ceph_put_snap_context(oldest);
 
 	/* is this a partial page at end of file? */
 	if (snap_size)
@@ -457,7 +450,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	ClearPagePrivate(page);
 	end_page_writeback(page);
 	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
-	ceph_put_snap_context(snapc);
+	ceph_put_snap_context(snapc);  /* page's reference */
 out:
 	return err;
 }
@@ -516,7 +509,7 @@ static void writepages_finish(struct ceph_osd_request *req,
 	u64 bytes = 0;
 	struct ceph_client *client = ceph_inode_to_client(inode);
 	long writeback_stat;
-	unsigned issued = __ceph_caps_issued(ci, NULL);
+	unsigned issued = ceph_caps_issued(ci);
 
 	/* parse reply */
 	replyhead = msg->front.iov_base;
@@ -557,9 +550,9 @@ static void writepages_finish(struct ceph_osd_request *req,
 			dout("inode %p skipping page %p\n", inode, page);
 			wbc->pages_skipped++;
 		}
+		ceph_put_snap_context((void *)page->private);
 		page->private = 0;
 		ClearPagePrivate(page);
-		ceph_put_snap_context(snapc);
 		dout("unlocking %d %p\n", i, page);
 		end_page_writeback(page);
 
@@ -617,7 +610,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	int range_whole = 0;
 	int should_loop = 1;
 	pgoff_t max_pages = 0, max_pages_ever = 0;
-	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
+	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
 	struct pagevec pvec;
 	int done = 0;
 	int rc = 0;
@@ -769,9 +762,10 @@ get_more_pages:
 			}
 
 			/* only if matching snap context */
-			if (snapc != (void *)page->private) {
-				dout("page snapc %p != oldest %p\n",
-				     (void *)page->private, snapc);
+			pgsnapc = (void *)page->private;
+			if (pgsnapc->seq > snapc->seq) {
+				dout("page snapc %p %lld > oldest %p %lld\n",
+				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
 				unlock_page(page);
 				if (!locked_pages)
 					continue; /* keep looking for snap */
@@ -913,12 +907,19 @@ static int context_is_writeable_or_written(struct inode *inode,
 					   struct ceph_snap_context *snapc)
 {
 	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
-	return !oldest || snapc->seq <= oldest->seq;
+	int ret = !oldest || snapc->seq <= oldest->seq;
+
+	ceph_put_snap_context(oldest);
+	return ret;
 }
 
 /*
  * We are only allowed to write into/dirty the page if the page is
  * clean, or already dirty within the same snap context.
+ *
+ * called with page locked.
+ * return success with page locked,
+ * or any failure (incl -EAGAIN) with page unlocked.
  */
 static int ceph_update_writeable_page(struct file *file,
 			    loff_t pos, unsigned len,
@@ -931,8 +932,8 @@ static int ceph_update_writeable_page(struct file *file,
 	int pos_in_page = pos & ~PAGE_CACHE_MASK;
 	int end_in_page = pos_in_page + len;
 	loff_t i_size;
-	struct ceph_snap_context *snapc;
 	int r;
+	struct ceph_snap_context *snapc, *oldest;
 
 retry_locked:
 	/* writepages currently holds page lock, but if we change that later, */
@@ -942,30 +943,34 @@ retry_locked:
 	BUG_ON(!ci->i_snap_realm);
 	down_read(&mdsc->snap_rwsem);
 	BUG_ON(!ci->i_snap_realm->cached_context);
-	if (page->private &&
-	    (void *)page->private != ci->i_snap_realm->cached_context) {
+	snapc = (void *)page->private;
+	if (snapc && snapc != ci->i_head_snapc) {
 		/*
 		 * this page is already dirty in another (older) snap
 		 * context!  is it writeable now?
 		 */
-		snapc = get_oldest_context(inode, NULL);
+		oldest = get_oldest_context(inode, NULL);
 		up_read(&mdsc->snap_rwsem);
 
-		if (snapc != (void *)page->private) {
+		if (snapc->seq > oldest->seq) {
+			ceph_put_snap_context(oldest);
 			dout(" page %p snapc %p not current or oldest\n",
-			     page, (void *)page->private);
+			     page, snapc);
 			/*
 			 * queue for writeback, and wait for snapc to
 			 * be writeable or written
 			 */
-			snapc = ceph_get_snap_context((void *)page->private);
+			snapc = ceph_get_snap_context(snapc);
 			unlock_page(page);
 			ceph_queue_writeback(inode);
-			wait_event_interruptible(ci->i_cap_wq,
+			r = wait_event_interruptible(ci->i_cap_wq,
 			       context_is_writeable_or_written(inode, snapc));
 			ceph_put_snap_context(snapc);
+			if (r == -ERESTARTSYS)
+				return r;
 			return -EAGAIN;
 		}
+		ceph_put_snap_context(oldest);
 
 		/* yay, writeable, do it now (without dropping page lock) */
 		dout(" page %p snapc %p not current, but oldest\n",
@@ -1035,7 +1040,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 	int r;
 
 	do {
-		/* get a page*/
+		/* get a page */
 		page = grab_cache_page_write_begin(mapping, index, 0);
 		if (!page)
 			return -ENOMEM;
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index abb204fea6c..818afe72e6c 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,7 +1,9 @@
 #include "ceph_debug.h"
 
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 #include "types.h"
 #include "auth_none.h"
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index b4ef6f0a6c8..8cd9e3af07f 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -4,6 +4,7 @@
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/slab.h>
 
 #include "auth_none.h"
 #include "auth.h"
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
index 56c05533a31..8164df1a08b 100644
--- a/fs/ceph/auth_none.h
+++ b/fs/ceph/auth_none.h
@@ -1,6 +1,8 @@
 #ifndef _FS_CEPH_AUTH_NONE_H
 #define _FS_CEPH_AUTH_NONE_H
 
+#include <linux/slab.h>
+
 #include "auth.h"
 
 /*
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index f0318427b6d..fee5a08da88 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -4,6 +4,7 @@
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/slab.h>
 
 #include "auth_x.h"
 #include "auth_x_protocol.h"
@@ -11,8 +12,6 @@
 #include "auth.h"
 #include "decode.h"
 
-struct kmem_cache *ceph_x_ticketbuf_cachep;
-
 #define TEMP_TICKET_BUF_LEN	256
 
 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
@@ -28,6 +27,12 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
 	return (ac->want_keys & xi->have_keys) == ac->want_keys;
 }
 
+static int ceph_x_encrypt_buflen(int ilen)
+{
+	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+		sizeof(u32);
+}
+
 static int ceph_x_encrypt(struct ceph_crypto_key *secret,
 			  void *ibuf, int ilen, void *obuf, size_t olen)
 {
@@ -124,13 +129,12 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 	char *ticket_buf;
 	u8 struct_v;
 
-	dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
+	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!dbuf)
 		return -ENOMEM;
 
 	ret = -ENOMEM;
-	ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
-				      GFP_NOFS | GFP_ATOMIC);
+	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!ticket_buf)
 		goto out_dbuf;
 
@@ -150,6 +154,11 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		struct timespec validity;
 		struct ceph_crypto_key old_key;
 		void *tp, *tpend;
+		struct ceph_timespec new_validity;
+		struct ceph_crypto_key new_session_key;
+		struct ceph_buffer *new_ticket_blob;
+		unsigned long new_expires, new_renew_after;
+		u64 new_secret_id;
 
 		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
 
@@ -182,16 +191,16 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 			goto bad;
 
 		memcpy(&old_key, &th->session_key, sizeof(old_key));
-		ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
+		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
 		if (ret)
 			goto out;
 
-		ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
-		ceph_decode_timespec(&validity, &th->validity);
-		th->expires = get_seconds() + validity.tv_sec;
-		th->renew_after = th->expires - (validity.tv_sec / 4);
-		dout(" expires=%lu renew_after=%lu\n", th->expires,
-		     th->renew_after);
+		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+		ceph_decode_timespec(&validity, &new_validity);
+		new_expires = get_seconds() + validity.tv_sec;
+		new_renew_after = new_expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", new_expires,
+		     new_renew_after);
 
 		/* ticket blob for service */
 		ceph_decode_8_safe(&p, end, is_enc, bad);
@@ -216,10 +225,21 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		dout(" ticket blob is %d bytes\n", dlen);
 		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
 		struct_v = ceph_decode_8(&tp);
-		th->secret_id = ceph_decode_64(&tp);
-		ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
+		new_secret_id = ceph_decode_64(&tp);
+		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
 		if (ret)
 			goto out;
+
+		/* all is well, update our ticket */
+		ceph_crypto_key_destroy(&th->session_key);
+		if (th->ticket_blob)
+			ceph_buffer_put(th->ticket_blob);
+		th->session_key = new_session_key;
+		th->ticket_blob = new_ticket_blob;
+		th->validity = new_validity;
+		th->secret_id = new_secret_id;
+		th->expires = new_expires;
+		th->renew_after = new_renew_after;
 		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
 		     type, ceph_entity_type_name(type), th->secret_id,
 		     (int)th->ticket_blob->vec.iov_len);
@@ -228,9 +248,9 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 
 	ret = 0;
 out:
-	kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
+	kfree(ticket_buf);
 out_dbuf:
-	kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
+	kfree(dbuf);
 	return ret;
 
 bad:
@@ -242,7 +262,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 				   struct ceph_x_ticket_handler *th,
 				   struct ceph_x_authorizer *au)
 {
-	int len;
+	int maxlen;
 	struct ceph_x_authorize_a *msg_a;
 	struct ceph_x_authorize_b msg_b;
 	void *p, *end;
@@ -253,15 +273,15 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	dout("build_authorizer for %s %p\n",
 	     ceph_entity_type_name(th->service), au);
 
-	len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
-		ticket_blob_len + 16;
-	dout("  need len %d\n", len);
-	if (au->buf && au->buf->alloc_len < len) {
+	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+		ceph_x_encrypt_buflen(ticket_blob_len);
+	dout("  need len %d\n", maxlen);
+	if (au->buf && au->buf->alloc_len < maxlen) {
 		ceph_buffer_put(au->buf);
 		au->buf = NULL;
 	}
 	if (!au->buf) {
-		au->buf = ceph_buffer_new(len, GFP_NOFS);
+		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
 		if (!au->buf)
 			return -ENOMEM;
 	}
@@ -296,6 +316,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
 	dout(" built authorizer nonce %llx len %d\n", au->nonce,
 	     (int)au->buf->vec.iov_len);
+	BUG_ON(au->buf->vec.iov_len > maxlen);
 	return 0;
 
 out_buf:
@@ -581,8 +602,6 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
 		remove_ticket_handler(ac, th);
 	}
 
-	kmem_cache_destroy(ceph_x_ticketbuf_cachep);
-
 	kfree(ac->private);
 	ac->private = NULL;
 }
@@ -617,26 +636,20 @@ int ceph_x_init(struct ceph_auth_client *ac)
 	int ret;
 
 	dout("ceph_x_init %p\n", ac);
+	ret = -ENOMEM;
 	xi = kzalloc(sizeof(*xi), GFP_NOFS);
 	if (!xi)
-		return -ENOMEM;
+		goto out;
 
-	ret = -ENOMEM;
-	ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
-				      TEMP_TICKET_BUF_LEN, 8,
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      NULL);
-	if (!ceph_x_ticketbuf_cachep)
-		goto done_nomem;
 	ret = -EINVAL;
 	if (!ac->secret) {
 		pr_err("no secret set (for auth_x protocol)\n");
-		goto done_nomem;
+		goto out_nomem;
 	}
 
 	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
 	if (ret)
-		goto done_nomem;
+		goto out_nomem;
 
 	xi->starting = true;
 	xi->ticket_handlers = RB_ROOT;
@@ -646,10 +659,9 @@ int ceph_x_init(struct ceph_auth_client *ac)
 	ac->ops = &ceph_x_ops;
 	return 0;
 
-done_nomem:
+out_nomem:
 	kfree(xi);
-	if (ceph_x_ticketbuf_cachep)
-		kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+out:
 	return ret;
 }
 
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index b98086c7aeb..c67535d70aa 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -1,5 +1,8 @@
 
 #include "ceph_debug.h"
+
+#include <linux/slab.h>
+
 #include "buffer.h"
 #include "decode.h"
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index db122bb357b..0c168180686 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/wait.h>
 #include <linux/writeback.h>
@@ -1204,6 +1205,12 @@ retry:
 		if (capsnap->dirty_pages || capsnap->writing)
 			continue;
 
+		/*
+		 * if cap writeback already occurred, we should have dropped
+		 * the capsnap in ceph_put_wrbuffer_cap_refs.
+		 */
+		BUG_ON(capsnap->dirty == 0);
+
 		/* pick mds, take s_mutex */
 		mds = __ceph_get_cap_mds(ci, &mseq);
 		if (session && session->s_mds != mds) {
@@ -1407,6 +1414,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
  */
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
+	__releases(session->s_mutex)
 {
 	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
 	struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1414,7 +1422,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_cap *cap;
 	int file_wanted, used;
 	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
-	int drop_session_lock = session ? 0 : 1;
 	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1;   /* keep track of how far we've gone through i_caps list
 			   to avoid an infinite loop on retry */
@@ -1639,7 +1646,7 @@ ack:
 	if (queue_invalidate)
 		ceph_queue_invalidate(inode);
 
-	if (session && drop_session_lock)
+	if (session)
 		mutex_unlock(&session->s_mutex);
 	if (took_snap_rwsem)
 		up_read(&mdsc->snap_rwsem);
@@ -1854,8 +1861,8 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 		} else {
 			pr_err("%p auth cap %p not mds%d ???\n", inode,
 			       cap, session->s_mds);
-			spin_unlock(&inode->i_lock);
 		}
+		spin_unlock(&inode->i_lock);
 	}
 }
 
@@ -2117,8 +2124,8 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 		}
 	spin_unlock(&inode->i_lock);
 
-	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
-	     last ? "last" : "");
+	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
+	     last ? " last" : "", put ? " put" : "");
 
 	if (last && !flushsnaps)
 		ceph_check_caps(ci, 0, NULL);
@@ -2142,7 +2149,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 {
 	struct inode *inode = &ci->vfs_inode;
 	int last = 0;
-	int last_snap = 0;
+	int complete_capsnap = 0;
+	int drop_capsnap = 0;
 	int found = 0;
 	struct ceph_cap_snap *capsnap = NULL;
 
@@ -2165,19 +2173,32 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
 			if (capsnap->context == snapc) {
 				found = 1;
-				capsnap->dirty_pages -= nr;
-				last_snap = !capsnap->dirty_pages;
 				break;
 			}
 		}
 		BUG_ON(!found);
+		capsnap->dirty_pages -= nr;
+		if (capsnap->dirty_pages == 0) {
+			complete_capsnap = 1;
+			if (capsnap->dirty == 0)
+				/* cap writeback completed before we created
+				 * the cap_snap; no FLUSHSNAP is needed */
+				drop_capsnap = 1;
+		}
 		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
-		     " snap %lld %d/%d -> %d/%d %s%s\n",
+		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
 		     inode, capsnap, capsnap->context->seq,
 		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
 		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
 		     last ? " (wrbuffer last)" : "",
-		     last_snap ? " (capsnap last)" : "");
+		     complete_capsnap ? " (complete capsnap)" : "",
+		     drop_capsnap ? " (drop capsnap)" : "");
+		if (drop_capsnap) {
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+		}
 	}
 
 	spin_unlock(&inode->i_lock);
@@ -2185,28 +2206,31 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 	if (last) {
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 		iput(inode);
-	} else if (last_snap) {
+	} else if (complete_capsnap) {
 		ceph_flush_snaps(ci);
 		wake_up(&ci->i_cap_wq);
 	}
+	if (drop_capsnap)
+		iput(inode);
 }
 
 /*
  * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
  * actually be a revocation if it specifies a smaller cap set.)
  *
- * caller holds s_mutex.
+ * caller holds s_mutex and i_lock, we drop both.
+ *
  * return value:
  *  0 - ok
  *  1 - check_caps on auth cap only (writeback)
  *  2 - check_caps (ack revoke)
  */
-static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
-			    struct ceph_mds_session *session,
-			    struct ceph_cap *cap,
-			    struct ceph_buffer *xattr_buf)
+static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+			     struct ceph_mds_session *session,
+			     struct ceph_cap *cap,
+			     struct ceph_buffer *xattr_buf)
 	__releases(inode->i_lock)
-
+	__releases(session->s_mutex)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
@@ -2216,7 +2240,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	u64 size = le64_to_cpu(grant->size);
 	u64 max_size = le64_to_cpu(grant->max_size);
 	struct timespec mtime, atime, ctime;
-	int reply = 0;
+	int check_caps = 0;
 	int wake = 0;
 	int writeback = 0;
 	int revoked_rdcache = 0;
@@ -2329,11 +2353,12 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
 			writeback = 1; /* will delay ack */
 		else if (dirty & ~newcaps)
-			reply = 1;     /* initiate writeback in check_caps */
+			check_caps = 1;  /* initiate writeback in check_caps */
 		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
 			   revoked_rdcache)
-			reply = 2;     /* send revoke ack in check_caps */
+			check_caps = 2;     /* send revoke ack in check_caps */
 		cap->issued = newcaps;
+		cap->implemented |= newcaps;
 	} else if (cap->issued == newcaps) {
 		dout("caps unchanged: %s -> %s\n",
 		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
@@ -2346,6 +2371,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 					      * pending revocation */
 		wake = 1;
 	}
+	BUG_ON(cap->issued & ~cap->implemented);
 
 	spin_unlock(&inode->i_lock);
 	if (writeback)
@@ -2359,7 +2385,14 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		ceph_queue_invalidate(inode);
 	if (wake)
 		wake_up(&ci->i_cap_wq);
-	return reply;
+
+	if (check_caps == 1)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+				session);
+	else if (check_caps == 2)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+	else
+		mutex_unlock(&session->s_mutex);
 }
 
 /*
@@ -2454,8 +2487,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 				break;
 			}
 			WARN_ON(capsnap->dirty_pages || capsnap->writing);
-			dout(" removing cap_snap %p follows %lld\n",
-			     capsnap, follows);
+			dout(" removing %p cap_snap %p follows %lld\n",
+			     inode, capsnap, follows);
 			ceph_put_snap_context(capsnap->context);
 			list_del(&capsnap->ci_item);
 			list_del(&capsnap->flushing_item);
@@ -2548,9 +2581,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			ci->i_cap_exporting_issued = cap->issued;
 		}
 		__ceph_remove_cap(cap);
-	} else {
-		WARN_ON(!cap);
 	}
+	/* else, we already released it */
 
 	spin_unlock(&inode->i_lock);
 }
@@ -2621,9 +2653,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	u64 cap_id;
 	u64 size, max_size;
 	u64 tid;
-	int check_caps = 0;
 	void *snaptrace;
-	int r;
 
 	dout("handle_caps from mds%d\n", mds);
 
@@ -2668,8 +2698,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
 				  snaptrace, le32_to_cpu(h->snap_trace_len));
-		check_caps = 1; /* we may have sent a RELEASE to the old auth */
-		goto done;
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
+				session);
+		goto done_unlocked;
 	}
 
 	/* the rest require a cap */
@@ -2686,16 +2717,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	switch (op) {
 	case CEPH_CAP_OP_REVOKE:
 	case CEPH_CAP_OP_GRANT:
-		r = handle_cap_grant(inode, h, session, cap, msg->middle);
-		if (r == 1)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
-					session);
-		else if (r == 2)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY,
-					session);
-		break;
+		handle_cap_grant(inode, h, session, cap, msg->middle);
+		goto done_unlocked;
 
 	case CEPH_CAP_OP_FLUSH_ACK:
 		handle_cap_flush_ack(inode, tid, h, session, cap);
@@ -2713,9 +2736,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
 done:
 	mutex_unlock(&session->s_mutex);
-
-	if (check_caps)
-		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+done_unlocked:
 	if (inode)
 		iput(inode);
 	return;
@@ -2838,11 +2859,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 	struct ceph_cap *cap;
 	struct ceph_mds_request_release *rel = *p;
 	int ret = 0;
-
-	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
-	     mds, ceph_cap_string(drop), ceph_cap_string(unless));
+	int used = 0;
 
 	spin_lock(&inode->i_lock);
+	used = __ceph_caps_used(ci);
+
+	dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
+	     mds, ceph_cap_string(used), ceph_cap_string(drop),
+	     ceph_cap_string(unless));
+
+	/* only drop unused caps */
+	drop &= ~used;
+
 	cap = __get_cap_for_mds(ci, mds);
 	if (cap && __cap_is_valid(cap)) {
 		if (force ||
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 291ac288e79..f704b3b6242 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -3,6 +3,7 @@
 
 #include <linux/err.h>
 #include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <crypto/hash.h>
 
 #include "crypto.h"
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index e159f141511..f7048da92ac 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/device.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/debugfs.h>
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 5107384ee02..650d2db5ed2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -3,6 +3,7 @@
 #include <linux/spinlock.h>
 #include <linux/fs_struct.h>
 #include <linux/namei.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 
 #include "super.h"
@@ -170,11 +171,11 @@ more:
 	spin_lock(&inode->i_lock);
 	spin_lock(&dcache_lock);
 
+	last = dentry;
+
 	if (err < 0)
 		goto out_unlock;
 
-	last = dentry;
-
 	p = p->prev;
 	filp->f_pos++;
 
@@ -288,8 +289,10 @@ more:
 			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
 
 		/* discard old result, if any */
-		if (fi->last_readdir)
+		if (fi->last_readdir) {
 			ceph_mdsc_put_request(fi->last_readdir);
+			fi->last_readdir = NULL;
+		}
 
 		/* requery frag tree, as the frag topology may have changed */
 		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
@@ -309,7 +312,7 @@ more:
 		req->r_readdir_offset = fi->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
 		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
-		req->r_num_caps = max_entries;
+		req->r_num_caps = max_entries + 1;
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
 		if (err < 0) {
 			ceph_mdsc_put_request(req);
@@ -486,6 +489,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
+		BUG_ON(!d_unhashed(dentry));
 		d_add(dentry, inode);
 		err = 0;
 	}
@@ -876,7 +880,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 * do_request, above).  If there is no trace, we need
 		 * to do it here.
 		 */
+
+		/* d_move screws up d_subdirs order */
+		ceph_i_clear(new_dir, CEPH_I_COMPLETE);
+
 		d_move(old_dentry, new_dentry);
+
+		/* ensure target dentry is invalidated, despite
+		   rehashing bug in vfs_rename_dir */
+		new_dentry->d_time = jiffies;
+		ceph_dentry(new_dentry)->lease_shared_gen = 0;
 	}
 	ceph_mdsc_put_request(req);
 	return err;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fc68e39cbad..9d67572fb32 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/exportfs.h>
+#include <linux/slab.h>
 #include <asm/unaligned.h>
 
 #include "super.h"
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5d2af8464f6..ed6f19721d6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/namei.h>
 #include <linux/writeback.h>
@@ -664,7 +665,8 @@ more:
 		 * throw out any page cache pages in this range. this
 		 * may block.
 		 */
-		truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
+		truncate_inode_pages_range(inode->i_mapping, pos, 
+					   (pos+len) | (PAGE_CACHE_SIZE-1));
 	} else {
 		pages = alloc_page_vector(num_pages);
 		if (IS_ERR(pages)) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7abe1aed819..261f3e6c0bc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -378,6 +378,22 @@ void ceph_destroy_inode(struct inode *inode)
 
 	ceph_queue_caps_release(inode);
 
+	/*
+	 * we may still have a snap_realm reference if there are stray
+	 * caps in i_cap_exporting_issued or i_snap_caps.
+	 */
+	if (ci->i_snap_realm) {
+		struct ceph_mds_client *mdsc =
+			&ceph_client(ci->vfs_inode.i_sb)->mdsc;
+		struct ceph_snap_realm *realm = ci->i_snap_realm;
+
+		dout(" dropping residual ref to snap realm %p\n", realm);
+		spin_lock(&realm->inodes_with_caps_lock);
+		list_del_init(&ci->i_snap_realm_item);
+		spin_unlock(&realm->inodes_with_caps_lock);
+		ceph_put_snap_realm(mdsc, realm);
+	}
+
 	kfree(ci->i_symlink);
 	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
 		frag = rb_entry(n, struct ceph_inode_frag, node);
@@ -870,6 +886,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	struct inode *in = NULL;
 	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
+	struct ceph_client *client = ceph_sb_to_client(sb);
 	int i = 0;
 	int err = 0;
 
@@ -933,7 +950,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			return err;
 	}
 
-	if (rinfo->head->is_dentry && !req->r_aborted) {
+	/*
+	 * ignore null lease/binding on snapdir ENOENT, or else we
+	 * will have trouble splicing in the virtual snapdir later
+	 */
+	if (rinfo->head->is_dentry && !req->r_aborted &&
+	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
+					       client->mount_args->snapdir_name,
+					       req->r_dentry->d_name.len))) {
 		/*
 		 * lookup link rename   : null -> possibly existing inode
 		 * mknod symlink mkdir  : null -> new inode
@@ -973,6 +997,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			     dn, dn->d_name.len, dn->d_name.name);
 			dout("fill_trace doing d_move %p -> %p\n",
 			     req->r_old_dentry, dn);
+
+			/* d_move screws up d_subdirs order */
+			ceph_i_clear(dir, CEPH_I_COMPLETE);
+
 			d_move(req->r_old_dentry, dn);
 			dout(" src %p '%.*s' dst %p '%.*s'\n",
 			     req->r_old_dentry,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a2600101ec2..60a9a4ae47b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/wait.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 
 #include "mds_client.h"
@@ -328,6 +329,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	struct ceph_mds_session *s;
 
 	s = kzalloc(sizeof(*s), GFP_NOFS);
+	if (!s)
+		return ERR_PTR(-ENOMEM);
 	s->s_mdsc = mdsc;
 	s->s_mds = mds;
 	s->s_state = CEPH_MDS_SESSION_NEW;
@@ -529,7 +532,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 {
 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
 	rb_erase(&req->r_node, &mdsc->request_tree);
-	ceph_mdsc_put_request(req);
+	RB_CLEAR_NODE(&req->r_node);
 
 	if (req->r_unsafe_dir) {
 		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -538,6 +541,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 		list_del_init(&req->r_unsafe_dir_item);
 		spin_unlock(&ci->i_unsafe_lock);
 	}
+
+	ceph_mdsc_put_request(req);
 }
 
 /*
@@ -862,6 +867,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 	if (time_after_eq(jiffies, session->s_cap_ttl) &&
 	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
 		pr_info("mds%d caps stale\n", session->s_mds);
+	session->s_renew_requested = jiffies;
 
 	/* do not try to renew caps until a recovering mds has reconnected
 	 * with its clients. */
@@ -874,7 +880,6 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 
 	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
 		ceph_mds_state_name(state));
-	session->s_renew_requested = jiffies;
 	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
 				 ++session->s_renew_seq);
 	if (IS_ERR(msg))
@@ -1566,8 +1571,13 @@ static int __do_request(struct ceph_mds_client *mdsc,
 
 	/* get, open session */
 	session = __ceph_lookup_mds_session(mdsc, mds);
-	if (!session)
+	if (!session) {
 		session = register_session(mdsc, mds);
+		if (IS_ERR(session)) {
+			err = PTR_ERR(session);
+			goto finish;
+		}
+	}
 	dout("do_request mds%d session %p state %s\n", mds, session,
 	     session_state_name(session->s_state));
 	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1770,7 +1780,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	dout("handle_reply %p\n", req);
 
 	/* correct session? */
-	if (!req->r_session && req->r_session != session) {
+	if (req->r_session != session) {
 		pr_err("mdsc_handle_reply got %llu on session mds%d"
 		       " not mds%d\n", tid, session->s_mds,
 		       req->r_session ? req->r_session->s_mds : -1);
@@ -2682,29 +2692,41 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
  */
 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
 {
-	struct ceph_mds_request *req = NULL;
+	struct ceph_mds_request *req = NULL, *nextreq;
 	struct rb_node *n;
 
 	mutex_lock(&mdsc->mutex);
 	dout("wait_unsafe_requests want %lld\n", want_tid);
+restart:
 	req = __get_oldest_req(mdsc);
 	while (req && req->r_tid <= want_tid) {
+		/* find next request */
+		n = rb_next(&req->r_node);
+		if (n)
+			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
+		else
+			nextreq = NULL;
 		if ((req->r_op & CEPH_MDS_OP_WRITE)) {
 			/* write op */
 			ceph_mdsc_get_request(req);
+			if (nextreq)
+				ceph_mdsc_get_request(nextreq);
 			mutex_unlock(&mdsc->mutex);
 			dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
 			     req->r_tid, want_tid);
 			wait_for_completion(&req->r_safe_completion);
 			mutex_lock(&mdsc->mutex);
-			n = rb_next(&req->r_node);
 			ceph_mdsc_put_request(req);
-		} else {
-			n = rb_next(&req->r_node);
+			if (!nextreq)
+				break;  /* next dne before, so we're done! */
+			if (RB_EMPTY_NODE(&nextreq->r_node)) {
+				/* next request was removed from tree */
+				ceph_mdsc_put_request(nextreq);
+				goto restart;
+			}
+			ceph_mdsc_put_request(nextreq);  /* won't go away */
 		}
-		if (!n)
-			break;
-		req = rb_entry(n, struct ceph_mds_request, r_node);
+		req = nextreq;
 	}
 	mutex_unlock(&mdsc->mutex);
 	dout("wait_unsafe_requests done\n");
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 781656a49bf..509f57d9ccb 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -6,6 +6,7 @@
 #include <linux/inet.h>
 #include <linux/kthread.h>
 #include <linux/net.h>
+#include <linux/slab.h>
 #include <linux/socket.h>
 #include <linux/string.h>
 #include <net/tcp.h>
@@ -29,6 +30,10 @@ static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
 
+#ifdef CONFIG_LOCKDEP
+static struct lock_class_key socket_class;
+#endif
+
 
 static void queue_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
@@ -227,6 +232,10 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 	con->sock = sock;
 	sock->sk->sk_allocation = GFP_NOFS;
 
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_class(&sock->sk->sk_lock, &socket_class);
+#endif
+
 	set_sock_callbacks(sock, con);
 
 	dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
@@ -332,6 +341,7 @@ static void reset_connection(struct ceph_connection *con)
 		con->out_msg = NULL;
 	}
 	con->in_seq = 0;
+	con->in_seq_acked = 0;
 }
 
 /*
@@ -366,6 +376,14 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
 }
 
 /*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+	return con->connect_seq > 0;
+}
+
+/*
  * generic get/put
  */
 struct ceph_connection *ceph_con_get(struct ceph_connection *con)
@@ -830,13 +848,6 @@ static void prepare_read_connect(struct ceph_connection *con)
 	con->in_base_pos = 0;
 }
 
-static void prepare_read_connect_retry(struct ceph_connection *con)
-{
-	dout("prepare_read_connect_retry %p\n", con);
-	con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
-		+ sizeof(con->peer_addr_for_me);
-}
-
 static void prepare_read_ack(struct ceph_connection *con)
 {
 	dout("prepare_read_ack %p\n", con);
@@ -1146,7 +1157,7 @@ static int process_connect(struct ceph_connection *con)
 		}
 		con->auth_retry = 1;
 		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect_retry(con);
+		prepare_read_connect(con);
 		break;
 
 	case CEPH_MSGR_TAG_RESETSESSION:
@@ -1323,6 +1334,7 @@ static int read_partial_message(struct ceph_connection *con)
 	unsigned front_len, middle_len, data_len, data_off;
 	int datacrc = con->msgr->nocrc;
 	int skip;
+	u64 seq;
 
 	dout("read_partial_message con %p msg %p\n", con, m);
 
@@ -1357,6 +1369,25 @@ static int read_partial_message(struct ceph_connection *con)
 		return -EIO;
 	data_off = le16_to_cpu(con->in_hdr.data_off);
 
+	/* verify seq# */
+	seq = le64_to_cpu(con->in_hdr.seq);
+	if ((s64)seq - (s64)con->in_seq < 1) {
+		pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
+			ENTITY_NAME(con->peer_name),
+			pr_addr(&con->peer_addr.in_addr),
+			seq, con->in_seq + 1);
+		con->in_base_pos = -front_len - middle_len - data_len -
+			sizeof(m->footer);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
+		return 0;
+	} else if ((s64)seq - (s64)con->in_seq > 1) {
+		pr_err("read_partial_message bad seq %lld expected %lld\n",
+		       seq, con->in_seq + 1);
+		con->error_msg = "bad message sequence # for incoming message";
+		return -EBADMSG;
+	}
+
 	/* allocate message? */
 	if (!con->in_msg) {
 		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
@@ -1368,6 +1399,7 @@ static int read_partial_message(struct ceph_connection *con)
 			con->in_base_pos = -front_len - middle_len - data_len -
 				sizeof(m->footer);
 			con->in_tag = CEPH_MSGR_TAG_READY;
+			con->in_seq++;
 			return 0;
 		}
 		if (IS_ERR(con->in_msg)) {
@@ -1843,8 +1875,6 @@ static void ceph_fault(struct ceph_connection *con)
 		goto out;
 	}
 
-	clear_bit(BUSY, &con->state);  /* to avoid an improbable race */
-
 	mutex_lock(&con->mutex);
 	if (test_bit(CLOSED, &con->state))
 		goto out_unlock;
@@ -2021,6 +2051,7 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
 		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
 	} else {
 		dout("con_revoke_pages %p msg %p pages %p no-op\n",
 		     con, con->in_msg, msg);
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 4caaa591111..a343dae73cd 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -223,6 +223,7 @@ extern void ceph_con_init(struct ceph_messenger *msgr,
 			  struct ceph_connection *con);
 extern void ceph_con_open(struct ceph_connection *con,
 			  struct ceph_entity_addr *addr);
+extern bool ceph_con_opened(struct ceph_connection *con);
 extern void ceph_con_close(struct ceph_connection *con);
 extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
 extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 890597c09d4..8fdc011ca95 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/types.h>
+#include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/sched.h>
 
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index dbe63db9762..c7b4dedaace 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -413,11 +413,22 @@ static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
  */
 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
+	struct ceph_osd_request *req;
 	int ret = 0;
 
 	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
 	if (list_empty(&osd->o_requests)) {
 		__remove_osd(osdc, osd);
+	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+			  &osd->o_con.peer_addr,
+			  sizeof(osd->o_con.peer_addr)) == 0 &&
+		   !ceph_con_opened(&osd->o_con)) {
+		dout(" osd addr hasn't changed and connection never opened,"
+		     " letting msgr retry");
+		/* touch each r_stamp for handle_timeout()'s benfit */
+		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+			req->r_stamp = jiffies;
+		ret = -EAGAIN;
 	} else {
 		ceph_con_close(&osd->o_con);
 		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
@@ -633,7 +644,7 @@ static int __send_request(struct ceph_osd_client *osdc,
 	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
 	reqhead->reassert_version = req->r_reassert_version;
 
-	req->r_sent_stamp = jiffies;
+	req->r_stamp = jiffies;
 	list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
 
 	ceph_msg_get(req->r_request); /* send consumes a ref */
@@ -660,7 +671,7 @@ static void handle_timeout(struct work_struct *work)
 	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
 	unsigned long keepalive =
 		osdc->client->mount_args->osd_keepalive_timeout * HZ;
-	unsigned long last_sent = 0;
+	unsigned long last_stamp = 0;
 	struct rb_node *p;
 	struct list_head slow_osds;
 
@@ -697,12 +708,12 @@ static void handle_timeout(struct work_struct *work)
 		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
 				 r_req_lru_item);
 
-		if (time_before(jiffies, req->r_sent_stamp + timeout))
+		if (time_before(jiffies, req->r_stamp + timeout))
 			break;
 
-		BUG_ON(req == last_req && req->r_sent_stamp == last_sent);
+		BUG_ON(req == last_req && req->r_stamp == last_stamp);
 		last_req = req;
-		last_sent = req->r_sent_stamp;
+		last_stamp = req->r_stamp;
 
 		osd = req->r_osd;
 		BUG_ON(!osd);
@@ -718,7 +729,7 @@ static void handle_timeout(struct work_struct *work)
 	 */
 	INIT_LIST_HEAD(&slow_osds);
 	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies, req->r_sent_stamp + keepalive))
+		if (time_before(jiffies, req->r_stamp + keepalive))
 			break;
 
 		osd = req->r_osd;
@@ -862,7 +873,9 @@ static int __kick_requests(struct ceph_osd_client *osdc,
 
 	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
 	if (kickosd) {
-		__reset_osd(osdc, kickosd);
+		err = __reset_osd(osdc, kickosd);
+		if (err == -EAGAIN)
+			return 1;
 	} else {
 		for (p = rb_first(&osdc->osds); p; p = n) {
 			struct ceph_osd *osd =
@@ -913,7 +926,7 @@ static int __kick_requests(struct ceph_osd_client *osdc,
 
 kick:
 		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
-		     req->r_osd->o_osd);
+		     req->r_osd ? req->r_osd->o_osd : -1);
 		req->r_flags |= CEPH_OSD_FLAG_RETRY;
 		err = __send_request(osdc, req);
 		if (err) {
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 1b1a3ca43af..b0759911e7c 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -70,7 +70,7 @@ struct ceph_osd_request {
 
 	char              r_oid[40];          /* object name */
 	int               r_oid_len;
-	unsigned long     r_sent_stamp;
+	unsigned long     r_stamp;            /* send OR check time */
 	bool              r_resend;           /* msg send failed, needs retry */
 
 	struct ceph_file_layout r_file_layout;
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index b83f2692b83..2e2c15eed82 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -1,4 +1,7 @@
 
+#include "ceph_debug.h"
+
+#include <linux/slab.h>
 #include <asm/div64.h>
 
 #include "super.h"
@@ -6,7 +9,6 @@
 #include "crush/hash.h"
 #include "crush/mapper.h"
 #include "decode.h"
-#include "ceph_debug.h"
 
 char *ceph_osdmap_state_str(char *str, int len, int state)
 {
@@ -312,71 +314,6 @@ bad:
 	return ERR_PTR(err);
 }
 
-
-/*
- * osd map
- */
-void ceph_osdmap_destroy(struct ceph_osdmap *map)
-{
-	dout("osdmap_destroy %p\n", map);
-	if (map->crush)
-		crush_destroy(map->crush);
-	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
-		struct ceph_pg_mapping *pg =
-			rb_entry(rb_first(&map->pg_temp),
-				 struct ceph_pg_mapping, node);
-		rb_erase(&pg->node, &map->pg_temp);
-		kfree(pg);
-	}
-	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
-		struct ceph_pg_pool_info *pi =
-			rb_entry(rb_first(&map->pg_pools),
-				 struct ceph_pg_pool_info, node);
-		rb_erase(&pi->node, &map->pg_pools);
-		kfree(pi);
-	}
-	kfree(map->osd_state);
-	kfree(map->osd_weight);
-	kfree(map->osd_addr);
-	kfree(map);
-}
-
-/*
- * adjust max osd value.  reallocate arrays.
- */
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
-{
-	u8 *state;
-	struct ceph_entity_addr *addr;
-	u32 *weight;
-
-	state = kcalloc(max, sizeof(*state), GFP_NOFS);
-	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
-	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
-	if (state == NULL || addr == NULL || weight == NULL) {
-		kfree(state);
-		kfree(addr);
-		kfree(weight);
-		return -ENOMEM;
-	}
-
-	/* copy old? */
-	if (map->osd_state) {
-		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
-		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
-		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
-		kfree(map->osd_state);
-		kfree(map->osd_addr);
-		kfree(map->osd_weight);
-	}
-
-	map->osd_state = state;
-	map->osd_weight = weight;
-	map->osd_addr = addr;
-	map->max_osd = max;
-	return 0;
-}
-
 /*
  * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
  * to a set of osds)
@@ -480,6 +417,113 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
 	return NULL;
 }
 
+static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
+{
+	rb_erase(&pi->node, root);
+	kfree(pi->name);
+	kfree(pi);
+}
+
+void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
+{
+	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+	calc_pg_masks(pi);
+	*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+}
+
+static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
+{
+	struct ceph_pg_pool_info *pi;
+	u32 num, len, pool;
+
+	ceph_decode_32_safe(p, end, num, bad);
+	dout(" %d pool names\n", num);
+	while (num--) {
+		ceph_decode_32_safe(p, end, pool, bad);
+		ceph_decode_32_safe(p, end, len, bad);
+		dout("  pool %d len %d\n", pool, len);
+		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		if (pi) {
+			kfree(pi->name);
+			pi->name = kmalloc(len + 1, GFP_NOFS);
+			if (pi->name) {
+				memcpy(pi->name, *p, len);
+				pi->name[len] = '\0';
+				dout("  name is %s\n", pi->name);
+			}
+		}
+		*p += len;
+	}
+	return 0;
+
+bad:
+	return -EINVAL;
+}
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+	dout("osdmap_destroy %p\n", map);
+	if (map->crush)
+		crush_destroy(map->crush);
+	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(rb_first(&map->pg_temp),
+				 struct ceph_pg_mapping, node);
+		rb_erase(&pg->node, &map->pg_temp);
+		kfree(pg);
+	}
+	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+		struct ceph_pg_pool_info *pi =
+			rb_entry(rb_first(&map->pg_pools),
+				 struct ceph_pg_pool_info, node);
+		__remove_pg_pool(&map->pg_pools, pi);
+	}
+	kfree(map->osd_state);
+	kfree(map->osd_weight);
+	kfree(map->osd_addr);
+	kfree(map);
+}
+
+/*
+ * adjust max osd value.  reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+	u8 *state;
+	struct ceph_entity_addr *addr;
+	u32 *weight;
+
+	state = kcalloc(max, sizeof(*state), GFP_NOFS);
+	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+	if (state == NULL || addr == NULL || weight == NULL) {
+		kfree(state);
+		kfree(addr);
+		kfree(weight);
+		return -ENOMEM;
+	}
+
+	/* copy old? */
+	if (map->osd_state) {
+		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+		kfree(map->osd_state);
+		kfree(map->osd_addr);
+		kfree(map->osd_weight);
+	}
+
+	map->osd_state = state;
+	map->osd_weight = weight;
+	map->osd_addr = addr;
+	map->max_osd = max;
+	return 0;
+}
+
 /*
  * decode a full map.
  */
@@ -516,7 +560,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 	ceph_decode_32_safe(p, end, max, bad);
 	while (max--) {
 		ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
-		pi = kmalloc(sizeof(*pi), GFP_NOFS);
+		pi = kzalloc(sizeof(*pi), GFP_NOFS);
 		if (!pi)
 			goto bad;
 		pi->id = ceph_decode_32(p);
@@ -526,13 +570,13 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 				   ev, CEPH_PG_POOL_VERSION);
 			goto bad;
 		}
-		ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+		__decode_pool(p, pi);
 		__insert_pg_pool(&map->pg_pools, pi);
-		calc_pg_masks(pi);
-		*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
-		*p += le32_to_cpu(pi->v.num_removed_snap_intervals)
-			* sizeof(u64) * 2;
 	}
+
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
+
 	ceph_decode_32_safe(p, end, map->pool_max, bad);
 
 	ceph_decode_32_safe(p, end, map->flags, bad);
@@ -706,7 +750,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		}
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (!pi) {
-			pi = kmalloc(sizeof(*pi), GFP_NOFS);
+			pi = kzalloc(sizeof(*pi), GFP_NOFS);
 			if (!pi) {
 				err = -ENOMEM;
 				goto bad;
@@ -714,9 +758,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			pi->id = pool;
 			__insert_pg_pool(&map->pg_pools, pi);
 		}
-		ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-		calc_pg_masks(pi);
+		__decode_pool(p, pi);
 	}
+	if (version >= 5 && __decode_pool_names(p, end, map) < 0)
+		goto bad;
 
 	/* old_pool */
 	ceph_decode_32_safe(p, end, len, bad);
@@ -725,10 +770,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 
 		ceph_decode_32_safe(p, end, pool, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
-		if (pi) {
-			rb_erase(&pi->node, &map->pg_pools);
-			kfree(pi);
-		}
+		if (pi)
+			__remove_pg_pool(&map->pg_pools, pi);
 	}
 
 	/* new_up */
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 1fb55afb264..8bc9f1e4f56 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -23,6 +23,7 @@ struct ceph_pg_pool_info {
 	int id;
 	struct ceph_pg_pool v;
 	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+	char *name;
 };
 
 struct ceph_pg_mapping {
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 370e9369547..5f8dbf7c745 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -1,4 +1,5 @@
 
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 26ac8b89a67..a1fc1d017b5 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -11,8 +11,10 @@
 /*
  * osdmap encoding versions
  */
-#define CEPH_OSDMAP_INC_VERSION 4
-#define CEPH_OSDMAP_VERSION     4
+#define CEPH_OSDMAP_INC_VERSION     5
+#define CEPH_OSDMAP_INC_VERSION_EXT 5
+#define CEPH_OSDMAP_VERSION         5
+#define CEPH_OSDMAP_VERSION_EXT     5
 
 /*
  * fs id
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index bf2a5f3846a..d5114db7045 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,6 +1,7 @@
 #include "ceph_debug.h"
 
 #include <linux/sort.h>
+#include <linux/slab.h>
 
 #include "super.h"
 #include "decode.h"
@@ -314,9 +315,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 	   because we rebuild_snap_realms() works _downward_ in
 	   hierarchy after each update.) */
 	if (realm->cached_context &&
-	    realm->cached_context->seq <= realm->seq &&
+	    realm->cached_context->seq == realm->seq &&
 	    (!parent ||
-	     realm->cached_context->seq <= parent->cached_context->seq)) {
+	     realm->cached_context->seq >= parent->cached_context->seq)) {
 		dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
 		     " (unchanged)\n",
 		     realm->ino, realm, realm->cached_context,
@@ -430,8 +431,7 @@ static int dup_array(u64 **dst, __le64 *src, int num)
  * Caller must hold snap_rwsem for read (i.e., the realm topology won't
  * change).
  */
-void ceph_queue_cap_snap(struct ceph_inode_info *ci,
-			 struct ceph_snap_context *snapc)
+void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap_snap *capsnap;
@@ -450,10 +450,11 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
 		   as no new writes are allowed to start when pending, so any
 		   writes in progress now were started before the previous
 		   cap_snap.  lucky us. */
-		dout("queue_cap_snap %p snapc %p seq %llu used %d"
-		     " already pending\n", inode, snapc, snapc->seq, used);
+		dout("queue_cap_snap %p already pending\n", inode);
 		kfree(capsnap);
 	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+		struct ceph_snap_context *snapc = ci->i_head_snapc;
+
 		igrab(inode);
 
 		atomic_set(&capsnap->nref, 1);
@@ -462,7 +463,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
 		INIT_LIST_HEAD(&capsnap->flushing_item);
 
 		capsnap->follows = snapc->seq - 1;
-		capsnap->context = ceph_get_snap_context(snapc);
 		capsnap->issued = __ceph_caps_issued(ci, NULL);
 		capsnap->dirty = __ceph_caps_dirty(ci);
 
@@ -479,7 +479,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
 		   snapshot. */
 		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
 		ci->i_wrbuffer_ref_head = 0;
-		ceph_put_snap_context(ci->i_head_snapc);
+		capsnap->context = snapc;
 		ci->i_head_snapc = NULL;
 		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
 
@@ -521,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 	capsnap->ctime = inode->i_ctime;
 	capsnap->time_warp_seq = ci->i_time_warp_seq;
 	if (capsnap->dirty_pages) {
-		dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
+		dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
 		     "still has %d dirty pages\n", inode, capsnap,
 		     capsnap->context, capsnap->context->seq,
-		     capsnap->size, capsnap->dirty_pages);
+		     ceph_cap_string(capsnap->dirty), capsnap->size,
+		     capsnap->dirty_pages);
 		return 0;
 	}
-	dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
+	dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
 	     inode, capsnap, capsnap->context,
-	     capsnap->context->seq, capsnap->size);
+	     capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+	     capsnap->size);
 
 	spin_lock(&mdsc->snap_flush_lock);
 	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
@@ -601,7 +603,7 @@ more:
 				if (lastinode)
 					iput(lastinode);
 				lastinode = inode;
-				ceph_queue_cap_snap(ci, realm->cached_context);
+				ceph_queue_cap_snap(ci);
 				spin_lock(&realm->inodes_with_caps_lock);
 			}
 			spin_unlock(&realm->inodes_with_caps_lock);
@@ -818,11 +820,12 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			 * queued (again) by ceph_update_snap_trace()
 			 * below.  Queue it _now_, under the old context.
 			 */
+			spin_lock(&realm->inodes_with_caps_lock);
 			list_del_init(&ci->i_snap_realm_item);
+			spin_unlock(&realm->inodes_with_caps_lock);
 			spin_unlock(&inode->i_lock);
 
-			ceph_queue_cap_snap(ci,
-					    ci->i_snap_realm->cached_context);
+			ceph_queue_cap_snap(ci);
 
 			iput(inode);
 			continue;
@@ -866,16 +869,20 @@ skip_inode:
 				continue;
 			ci = ceph_inode(inode);
 			spin_lock(&inode->i_lock);
-			if (!ci->i_snap_realm)
-				goto split_skip_inode;
-			ceph_put_snap_realm(mdsc, ci->i_snap_realm);
-			spin_lock(&realm->inodes_with_caps_lock);
-			list_add(&ci->i_snap_realm_item,
-				 &realm->inodes_with_caps);
-			ci->i_snap_realm = realm;
-			spin_unlock(&realm->inodes_with_caps_lock);
-			ceph_get_snap_realm(mdsc, realm);
-split_skip_inode:
+			if (list_empty(&ci->i_snap_realm_item)) {
+				struct ceph_snap_realm *oldrealm =
+					ci->i_snap_realm;
+
+				dout(" moving %p to split realm %llx %p\n",
+				     inode, realm->ino, realm);
+				spin_lock(&realm->inodes_with_caps_lock);
+				list_add(&ci->i_snap_realm_item,
+					 &realm->inodes_with_caps);
+				ci->i_snap_realm = realm;
+				spin_unlock(&realm->inodes_with_caps_lock);
+				ceph_get_snap_realm(mdsc, realm);
+				ceph_put_snap_realm(mdsc, oldrealm);
+			}
 			spin_unlock(&inode->i_lock);
 			iput(inode);
 		}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4290a6e860b..f888cf487b7 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -11,6 +11,7 @@
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 #include <linux/statfs.h>
 #include <linux/string.h>
 #include <linux/version.h>
@@ -995,9 +996,10 @@ static int __init init_ceph(void)
 	if (ret)
 		goto out_icache;
 
-	pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
-		CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
-		CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
+	pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
+		CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
+		CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
+		CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
 	return 0;
 
 out_icache:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 65d12036b67..13513b80d87 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,8 +10,10 @@
 #include <linux/fs.h>
 #include <linux/mempool.h>
 #include <linux/pagemap.h>
+#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/writeback.h>
+#include <linux/slab.h>
 
 #include "types.h"
 #include "messenger.h"
@@ -714,8 +716,7 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m,
 extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			     struct ceph_mds_session *session,
 			     struct ceph_msg *msg);
-extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
-				struct ceph_snap_context *snapc);
+extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
 extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 				  struct ceph_cap_snap *capsnap);
 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 37d6ce64569..2845422907f 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -3,6 +3,7 @@
 #include "decode.h"
 
 #include <linux/xattr.h>
+#include <linux/slab.h>
 
 static bool ceph_is_valid_xattr(const char *name)
 {