From 114fc47492e23d93653e4a16664833e98d62a563 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Wed, 11 Jan 2012 17:41:01 -0800
Subject: ceph: change "ceph.layout" xattr to be "ceph.file.layout"

The virtual extended attribute named "ceph.layout" is meaningful
only for regular files.  Change its name to be "ceph.file.layout" to
more directly reflect that in the ceph xattr namespace.  Preserve
the old "ceph.layout" name for the time being (until we decide it's
safe to get rid of it entirely).

Add a missing initializer for "readonly" in the terminating entry.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/xattr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a5e36e4488a..9e6734e38c1 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -111,8 +111,10 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+	{ true, "ceph.file.layout", ceph_vxattrcb_layout},
+	/* The following extended attribute name is deprecated */
 	{ true, "ceph.layout", ceph_vxattrcb_layout},
-	{ NULL, NULL }
+	{ true, NULL, NULL }
 };
 
 static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
-- 
cgit v1.2.3-70-g09d2


From ab434b60ab07f8c44246b6fb0cddee436687a09a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 13 Jan 2012 22:22:03 -0800
Subject: ceph: initialize client debugfs outside of monc->mutex

Initializing debugfs under monc->mutex introduces a lock dependency for
sb->s_type->i_mutex_key, which (combined with several other dependencies)
leads to an annoying lockdep warning.  There's no particular reason to do
the debugfs setup under this lock, so move it out.

It used to be the case that our first monmap could come from the OSD; that
is no longer the case with recent servers, so we will reliably set up the
client entry during the initial authentication.

We don't have to worry about racing with debugfs teardown by
ceph_debugfs_client_cleanup() because ceph_destroy_client() calls
ceph_msgr_flush() first, which will wait for the message dispatch work
to complete (and the debugfs init to complete).

Fixes: #1940
Signed-off-by: Sage Weil <sage@newdream.net>
---
 net/ceph/ceph_common.c |  2 --
 net/ceph/mon_client.c  | 13 ++++++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 97f70e50ad3..761ad9d6cc3 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -85,8 +85,6 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
 	} else {
 		pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
 		memcpy(&client->fsid, fsid, sizeof(*fsid));
-		ceph_debugfs_client_init(client);
-		client->have_fsid = true;
 	}
 	return 0;
 }
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 0b62deae42b..1845cde2622 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -8,8 +8,8 @@
 
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/libceph.h>
+#include <linux/ceph/debugfs.h>
 #include <linux/ceph/decode.h>
-
 #include <linux/ceph/auth.h>
 
 /*
@@ -340,8 +340,19 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 	client->monc.monmap = monmap;
 	kfree(old);
 
+	if (!client->have_fsid) {
+		client->have_fsid = true;
+		mutex_unlock(&monc->mutex);
+		/*
+		 * do debugfs initialization without mutex to avoid
+		 * creating a locking dependency
+		 */
+		ceph_debugfs_client_init(client);
+		goto out_unlocked;
+	}
 out:
 	mutex_unlock(&monc->mutex);
+out_unlocked:
 	wake_up_all(&client->auth_wq);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 32852a81bccd9e3d1953b894966393d1b546576d Mon Sep 17 00:00:00 2001
From: Xi Wang <xi.wang@gmail.com>
Date: Sat, 14 Jan 2012 22:20:59 -0500
Subject: ceph: fix length validation in parse_reply_info()

"len" is read from network and thus needs validation.  Otherwise, given
a bogus "len" value, p+len could be an out-of-bounds pointer, which is
used in further parsing.

Signed-off-by: Xi Wang <xi.wang@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6203d805eb4..be1415fcaac 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
 	/* trace */
 	ceph_decode_32_safe(&p, end, len, bad);
 	if (len > 0) {
+		ceph_decode_need(&p, end, len, bad);
 		err = parse_reply_info_trace(&p, p+len, info, features);
 		if (err < 0)
 			goto out_bad;
@@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg,
 	/* extra */
 	ceph_decode_32_safe(&p, end, len, bad);
 	if (len > 0) {
+		ceph_decode_need(&p, end, len, bad);
 		err = parse_reply_info_extra(&p, p+len, info, features);
 		if (err < 0)
 			goto out_bad;
-- 
cgit v1.2.3-70-g09d2


From d8fb02abdc39f92a1066313e2b17047876afa8f9 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Thu, 12 Jan 2012 17:48:10 -0800
Subject: ceph: create a new session lock to avoid lock inversion

Lockdep was reporting a possible circular lock dependency in
dentry_lease_is_valid().  That function needs to sample the
session's s_cap_gen and s_cap_ttl fields coherently, but needs
to do so while holding a dentry lock.  The s_cap_lock field was
being used to protect the two fields, but that can't be taken while
holding a lock on a dentry within the session.

In most cases, the s_cap_gen and s_cap_ttl fields only get operated
on separately.  But in three cases they need to be updated together.
Implement a new lock to protect the spots where updating both
fields atomically is required.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Reviewed-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c       | 4 ++--
 fs/ceph/dir.c        | 4 ++--
 fs/ceph/mds_client.c | 8 +++++---
 fs/ceph/mds_client.h | 7 +++++--
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8b53193e4f7..90d789df9ce 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap)
 	unsigned long ttl;
 	u32 gen;
 
-	spin_lock(&cap->session->s_cap_lock);
+	spin_lock(&cap->session->s_gen_ttl_lock);
 	gen = cap->session->s_cap_gen;
 	ttl = cap->session->s_cap_ttl;
-	spin_unlock(&cap->session->s_cap_lock);
+	spin_unlock(&cap->session->s_gen_ttl_lock);
 
 	if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
 		dout("__cap_is_valid %p cap %p issued %s "
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 98954003a8d..63c52f33361 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -975,10 +975,10 @@ static int dentry_lease_is_valid(struct dentry *dentry)
 	di = ceph_dentry(dentry);
 	if (di && di->lease_session) {
 		s = di->lease_session;
-		spin_lock(&s->s_cap_lock);
+		spin_lock(&s->s_gen_ttl_lock);
 		gen = s->s_cap_gen;
 		ttl = s->s_cap_ttl;
-		spin_unlock(&s->s_cap_lock);
+		spin_unlock(&s->s_gen_ttl_lock);
 
 		if (di->lease_gen == gen &&
 		    time_before(jiffies, dentry->d_time) &&
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index be1415fcaac..a4fdf9397a9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -400,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
 	s->s_con.peer_name.num = cpu_to_le64(mds);
 
-	spin_lock_init(&s->s_cap_lock);
+	spin_lock_init(&s->s_gen_ttl_lock);
 	s->s_cap_gen = 0;
 	s->s_cap_ttl = 0;
+
+	spin_lock_init(&s->s_cap_lock);
 	s->s_renew_requested = 0;
 	s->s_renew_seq = 0;
 	INIT_LIST_HEAD(&s->s_caps);
@@ -2328,10 +2330,10 @@ static void handle_session(struct ceph_mds_session *session,
 	case CEPH_SESSION_STALE:
 		pr_info("mds%d caps went stale, renewing\n",
 			session->s_mds);
-		spin_lock(&session->s_cap_lock);
+		spin_lock(&session->s_gen_ttl_lock);
 		session->s_cap_gen++;
 		session->s_cap_ttl = 0;
-		spin_unlock(&session->s_cap_lock);
+		spin_unlock(&session->s_gen_ttl_lock);
 		send_renew_caps(mdsc, session);
 		break;
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a50ca0e3947..8c7c04ebb59 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -117,10 +117,13 @@ struct ceph_mds_session {
 	void             *s_authorizer_buf, *s_authorizer_reply_buf;
 	size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len;
 
-	/* protected by s_cap_lock */
-	spinlock_t        s_cap_lock;
+	/* protected by s_gen_ttl_lock */
+	spinlock_t        s_gen_ttl_lock;
 	u32               s_cap_gen;  /* inc each time we get mds stale msg */
 	unsigned long     s_cap_ttl;  /* when session caps expire */
+
+	/* protected by s_cap_lock */
+	spinlock_t        s_cap_lock;
 	struct list_head  s_caps;     /* all caps issued by this session */
 	int               s_nr_caps, s_trim_caps;
 	int               s_num_cap_releases;
-- 
cgit v1.2.3-70-g09d2


From 97bb59a03dd6767fcc00be09b0c6d9e5294eeea6 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Tue, 24 Jan 2012 10:08:36 -0600
Subject: rbd: fix a memory leak in rbd_get_client()

If an existing rbd client is found to be suitable for use in
rbd_get_client(), the rbd_options structure is not being
freed as it should.  Fix that.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 drivers/block/rbd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 148ab944378..7d8f8ddb335 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -380,6 +380,7 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
 	rbdc = __rbd_client_find(opt);
 	if (rbdc) {
 		ceph_destroy_options(opt);
+		kfree(rbd_opts);
 
 		/* using an existing client */
 		kref_get(&rbdc->kref);
-- 
cgit v1.2.3-70-g09d2


From d23a4b3fd6ef70b80411b39b8c8bc548a219ce70 Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@dreamhost.com>
Date: Sun, 29 Jan 2012 13:57:43 -0600
Subject: rbd: fix safety of rbd_put_client()

The rbd_client structure uses a kref to arrange for cleaning up and
freeing an instance when its last reference is dropped.  The cleanup
routine is rbd_client_release(), and one of the things it does is
delete the rbd_client from rbd_client_list.  It acquires node_lock
to do so, but the way it is done is still not safe.

The problem is that when attempting to reuse an existing rbd_client,
the structure found might already be in the process of getting
destroyed and cleaned up.

Here's the scenario, with "CLIENT" representing an existing
rbd_client that's involved in the race:

 Thread on CPU A                | Thread on CPU B
 ---------------                | ---------------
 rbd_put_client(CLIENT)         | rbd_get_client()
   kref_put()                   |   (acquires node_lock)
     kref->refcount becomes 0   |   __rbd_client_find() returns CLIENT
     calls rbd_client_release() |   kref_get(&CLIENT->kref);
                                |   (releases node_lock)
       (acquires node_lock)     |
       deletes CLIENT from list | ...and starts using CLIENT...
       (releases node_lock)     |
       and frees CLIENT         | <-- but CLIENT gets freed here

Fix this by having rbd_put_client() acquire node_lock.  The result
could still be improved, but at least it avoids this problem.

Signed-off-by: Alex Elder <elder@dreamhost.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 drivers/block/rbd.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7d8f8ddb335..7f40cb4553c 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -407,15 +407,15 @@ done_err:
 
 /*
  * Destroy ceph client
+ *
+ * Caller must hold node_lock.
  */
 static void rbd_client_release(struct kref *kref)
 {
 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 
 	dout("rbd_release_client %p\n", rbdc);
-	spin_lock(&node_lock);
 	list_del(&rbdc->node);
-	spin_unlock(&node_lock);
 
 	ceph_destroy_client(rbdc->client);
 	kfree(rbdc->rbd_opts);
@@ -428,7 +428,9 @@ static void rbd_client_release(struct kref *kref)
  */
 static void rbd_put_client(struct rbd_device *rbd_dev)
 {
+	spin_lock(&node_lock);
 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+	spin_unlock(&node_lock);
 	rbd_dev->rbd_client = NULL;
 	rbd_dev->client = NULL;
 }
-- 
cgit v1.2.3-70-g09d2