67 files changed, 7974 insertions, 1318 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index f54a157a029..9e9d70c02a0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -220,17 +220,16 @@ config JBD
 	tristate
 	help
 	  This is a generic journalling layer for block devices.  It is
-	  currently used by the ext3 and OCFS2 file systems, but it could
-	  also be used to add journal support to other file systems or block
+	  currently used by the ext3 file system, but it could also be
+	  used to add journal support to other file systems or block
 	  devices such as RAID or LVM.
 
-	  If you are using the ext3 or OCFS2 file systems, you need to
-	  say Y here. If you are not using ext3 OCFS2 then you will probably
-	  want to say N.
+	  If you are using the ext3 file system, you need to say Y here.
+	  If you are not using ext3 then you will probably want to say N.
 
 	  To compile this device as a module, choose M here: the module will be
-	  called jbd.  If you are compiling ext3 or OCFS2 into the kernel,
-	  you cannot compile this code as a module.
+	  called jbd.  If you are compiling ext3 into the kernel, you
+	  cannot compile this code as a module.
 
 config JBD_DEBUG
 	bool "JBD (ext3) debugging support"
@@ -254,15 +253,16 @@ config JBD2
 	help
 	  This is a generic journaling layer for block devices that support
 	  both 32-bit and 64-bit block numbers.  It is currently used by
-	  the ext4 filesystem, but it could also be used to add
+	  the ext4 and OCFS2 filesystems, but it could also be used to add
 	  journal support to other file systems or block devices such
 	  as RAID or LVM.
 
-	  If you are using ext4, you need to say Y here. If you are not
-	  using ext4 then you will probably want to say N.
+	  If you are using ext4 or OCFS2, you need to say Y here.
+	  If you are not using ext4 or OCFS2 then you will
+	  probably want to say N.
 
 	  To compile this device as a module, choose M here. The module will be
-	  called jbd2.  If you are compiling ext4 into the kernel,
+	  called jbd2.  If you are compiling ext4 or OCFS2 into the kernel,
 	  you cannot compile this code as a module.
 
 config JBD2_DEBUG
@@ -433,6 +433,14 @@ config FS_POSIX_ACL
 	bool
 	default n
 
+config FILE_LOCKING
+	bool "Enable POSIX file locking API" if EMBEDDED
+	default y
+	help
+	  This option enables standard file locking support, required
+          for filesystems like NFS and for the flock() system
+          call. Disabling this option saves about 11k.
+
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 
@@ -440,7 +448,7 @@ config OCFS2_FS
 	tristate "OCFS2 file system support"
 	depends on NET && SYSFS
 	select CONFIGFS_FS
-	select JBD
+	select JBD2
 	select CRC32
 	help
 	  OCFS2 is a general purpose extent based shared disk cluster file
@@ -511,6 +519,16 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 
+config OCFS2_COMPAT_JBD
+	bool "Use JBD for compatibility"
+	depends on OCFS2_FS
+	default n
+	select JBD
+	help
+	  The ocfs2 filesystem now uses JBD2 for its journalling.  JBD2
+	  is backwards compatible with JBD.  It is safe to say N here.
+	  However, if you really want to use the original JBD, say Y here.
+
 endif # BLOCK
 
 config DNOTIFY
@@ -1779,6 +1797,28 @@ config SUNRPC_XPRT_RDMA
 
 	  If unsure, say N.
 
+config SUNRPC_REGISTER_V4
+	bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
+	depends on SUNRPC && EXPERIMENTAL
+	default n
+	help
+	  Sun added support for registering RPC services at an IPv6
+	  address by creating two new versions of the rpcbind protocol
+	  (RFC 1833).
+
+	  This option enables support in the kernel RPC server for
+	  registering kernel RPC services via version 4 of the rpcbind
+	  protocol.  If you enable this option, you must run a portmapper
+	  daemon that supports rpcbind protocol version 4.
+
+	  Serving NFS over IPv6 from knfsd (the kernel's NFS server)
+	  requires that you enable this option and use a portmapper that
+	  supports rpcbind version 4.
+
+	  If unsure, say N to get traditional behavior (register kernel
+	  RPC services using only rpcbind version 2).  Distributions
+	  using the legacy Linux portmapper daemon must say N here.
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
 	depends on SUNRPC && EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index de404b00eb0..b6f27dc26b7 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,7 +7,7 @@
 
 obj-y :=	open.o read_write.o file_table.o super.o \
 		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
-		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
+		ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
+obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 
 nfsd-$(CONFIG_NFSD)		:= nfsctl.o
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 7725a0a9a55..97f6073ab33 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,6 @@
 obj-$(CONFIG_LOCKD) += lockd.o
 
 lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
-	        svcproc.o svcsubs.o mon.o xdr.o
+	        svcproc.o svcsubs.o mon.o xdr.o grace.o
 lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
 lockd-objs		      := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 0b45fd3a4bf..8307dd64bf4 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
 	u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
 	int status;
 
-	status = lockd_up(nlm_init->protocol);
+	status = lockd_up();
 	if (status < 0)
 		return ERR_PTR(status);
 
-	host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
+	host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
 				   nlm_init->protocol, nlm_version,
-				   nlm_init->hostname,
-				   strlen(nlm_init->hostname));
+				   nlm_init->hostname);
 	if (host == NULL) {
 		lockd_down();
 		return ERR_PTR(-ENOLCK);
@@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 /*
  * The server lockd has called us back to tell us the lock was granted
  */
-__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
+__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 {
 	const struct file_lock *fl = &lock->fl;
 	const struct nfs_fh *fh = &lock->fh;
@@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock
 		 */
 		if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
 			continue;
-		if (!nlm_cmp_addr(&block->b_host->h_addr, addr))
+		if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
 			continue;
 		if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
 			continue;
@@ -216,7 +215,7 @@ reclaimer(void *ptr)
 	/* This one ensures that our parent doesn't terminate while the
 	 * reclaim is in progress */
 	lock_kernel();
-	lockd_up(0); /* note: this cannot fail as lockd is already running */
+	lockd_up();	/* note: this cannot fail as lockd is already running */
 
 	dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
 
diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c
new file mode 100644
index 00000000000..183cc1f0af1
--- /dev/null
+++ b/fs/lockd/grace.c
@@ -0,0 +1,59 @@
+/*
+ * Common code for control of lockd and nfsv4 grace periods.
+ */
+
+#include <linux/module.h>
+#include <linux/lockd/bind.h>
+
+static LIST_HEAD(grace_list);
+static DEFINE_SPINLOCK(grace_lock);
+
+/**
+ * locks_start_grace
+ * @lm: who this grace period is for
+ *
+ * A grace period is a period during which locks should not be given
+ * out.  Currently grace periods are only enforced by the two lock
+ * managers (lockd and nfsd), using the locks_in_grace() function to
+ * check when they are in a grace period.
+ *
+ * This function is called to start a grace period.
+ */
+void locks_start_grace(struct lock_manager *lm)
+{
+	spin_lock(&grace_lock);
+	list_add(&lm->list, &grace_list);
+	spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_start_grace);
+
+/**
+ * locks_end_grace
+ * @lm: who this grace period is for
+ *
+ * Call this function to state that the given lock manager is ready to
+ * resume regular locking.  The grace period will not end until all lock
+ * managers that called locks_start_grace() also call locks_end_grace().
+ * Note that callers count on it being safe to call this more than once,
+ * and the second call should be a no-op.
+ */
+void locks_end_grace(struct lock_manager *lm)
+{
+	spin_lock(&grace_lock);
+	list_del_init(&lm->list);
+	spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_end_grace);
+
+/**
+ * locks_in_grace
+ *
+ * Lock managers call this function to determine when it is OK for them
+ * to answer ordinary lock requests, and when they should accept only
+ * lock reclaims.
+ */
+int locks_in_grace(void)
+{
+	return !list_empty(&grace_list);
+}
+EXPORT_SYMBOL_GPL(locks_in_grace);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a17664c7eac..9fd8889097b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -11,16 +11,17 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/in.h>
+#include <linux/in6.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/sm_inter.h>
 #include <linux/mutex.h>
 
+#include <net/ipv6.h>
 
 #define NLMDBG_FACILITY		NLMDBG_HOSTCACHE
 #define NLM_HOST_NRHASH		32
-#define NLM_ADDRHASH(addr)	(ntohl(addr) & (NLM_HOST_NRHASH-1))
 #define NLM_HOST_REBIND		(60 * HZ)
 #define NLM_HOST_EXPIRE		(300 * HZ)
 #define NLM_HOST_COLLECT	(120 * HZ)
@@ -30,42 +31,115 @@ static unsigned long		next_gc;
 static int			nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
-
 static void			nlm_gc_hosts(void);
-static struct nsm_handle *	__nsm_find(const struct sockaddr_in *,
-					const char *, unsigned int, int);
-static struct nsm_handle *	nsm_find(const struct sockaddr_in *sin,
-					 const char *hostname,
-					 unsigned int hostname_len);
+static struct nsm_handle	*nsm_find(const struct sockaddr *sap,
+						const size_t salen,
+						const char *hostname,
+						const size_t hostname_len,
+						const int create);
+
+struct nlm_lookup_host_info {
+	const int		server;		/* search for server|client */
+	const struct sockaddr	*sap;		/* address to search for */
+	const size_t		salen;		/* it's length */
+	const unsigned short	protocol;	/* transport to search for*/
+	const u32		version;	/* NLM version to search for */
+	const char		*hostname;	/* remote's hostname */
+	const size_t		hostname_len;	/* it's length */
+	const struct sockaddr	*src_sap;	/* our address (optional) */
+	const size_t		src_len;	/* it's length */
+};
+
+/*
+ * Hash function must work well on big- and little-endian platforms
+ */
+static unsigned int __nlm_hash32(const __be32 n)
+{
+	unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
+	return hash ^ (hash >> 8);
+}
+
+static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	return __nlm_hash32(sin->sin_addr.s_addr);
+}
+
+static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	const struct in6_addr addr = sin6->sin6_addr;
+	return __nlm_hash32(addr.s6_addr32[0]) ^
+	       __nlm_hash32(addr.s6_addr32[1]) ^
+	       __nlm_hash32(addr.s6_addr32[2]) ^
+	       __nlm_hash32(addr.s6_addr32[3]);
+}
+
+static unsigned int nlm_hash_address(const struct sockaddr *sap)
+{
+	unsigned int hash;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		hash = __nlm_hash_addr4(sap);
+		break;
+	case AF_INET6:
+		hash = __nlm_hash_addr6(sap);
+		break;
+	default:
+		hash = 0;
+	}
+	return hash & (NLM_HOST_NRHASH - 1);
+}
+
+static void nlm_clear_port(struct sockaddr *sap)
+{
+	switch (sap->sa_family) {
+	case AF_INET:
+		((struct sockaddr_in *)sap)->sin_port = 0;
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)sap)->sin6_port = 0;
+		break;
+	}
+}
+
+static void nlm_display_address(const struct sockaddr *sap,
+				char *buf, const size_t len)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+
+	switch (sap->sa_family) {
+	case AF_UNSPEC:
+		snprintf(buf, len, "unspecified");
+		break;
+	case AF_INET:
+		snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
+		break;
+	case AF_INET6:
+		if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+			snprintf(buf, len, NIPQUAD_FMT,
+				 NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
+		else
+			snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr));
+		break;
+	default:
+		snprintf(buf, len, "unsupported address family");
+		break;
+	}
+}
 
 /*
  * Common host lookup routine for server & client
  */
-static struct nlm_host *nlm_lookup_host(int server,
-					const struct sockaddr_in *sin,
-					int proto, u32 version,
-					const char *hostname,
-					unsigned int hostname_len,
-					const struct sockaddr_in *ssin)
+static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 {
 	struct hlist_head *chain;
 	struct hlist_node *pos;
 	struct nlm_host	*host;
 	struct nsm_handle *nsm = NULL;
-	int		hash;
-
-	dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
-			", p=%d, v=%u, my role=%s, name=%.*s)\n",
-			NIPQUAD(ssin->sin_addr.s_addr),
-			NIPQUAD(sin->sin_addr.s_addr), proto, version,
-			server? "server" : "client",
-			hostname_len,
-			hostname? hostname : "<none>");
 
-
-	hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
-
-	/* Lock hash table */
 	mutex_lock(&nlm_host_mutex);
 
 	if (time_after_eq(jiffies, next_gc))
@@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server,
 	 * different NLM rpc_clients into one single nlm_host object.
 	 * This would allow us to have one nlm_host per address.
 	 */
-	chain = &nlm_hosts[hash];
+	chain = &nlm_hosts[nlm_hash_address(ni->sap)];
 	hlist_for_each_entry(host, pos, chain, h_hash) {
-		if (!nlm_cmp_addr(&host->h_addr, sin))
+		if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
 			continue;
 
 		/* See if we have an NSM handle for this client */
 		if (!nsm)
 			nsm = host->h_nsmhandle;
 
-		if (host->h_proto != proto)
+		if (host->h_proto != ni->protocol)
 			continue;
-		if (host->h_version != version)
+		if (host->h_version != ni->version)
 			continue;
-		if (host->h_server != server)
+		if (host->h_server != ni->server)
 			continue;
-		if (!nlm_cmp_addr(&host->h_saddr, ssin))
+		if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
 			continue;
 
 		/* Move to head of hash chain. */
@@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server,
 		hlist_add_head(&host->h_hash, chain);
 
 		nlm_get_host(host);
+		dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
+				host->h_name, host->h_addrbuf);
 		goto out;
 	}
-	if (nsm)
-		atomic_inc(&nsm->sm_count);
-
-	host = NULL;
 
-	/* Sadly, the host isn't in our hash table yet. See if
-	 * we have an NSM handle for it. If not, create one.
+	/*
+	 * The host wasn't in our hash table.  If we don't
+	 * have an NSM handle for it yet, create one.
 	 */
-	if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
-		goto out;
+	if (nsm)
+		atomic_inc(&nsm->sm_count);
+	else {
+		host = NULL;
+		nsm = nsm_find(ni->sap, ni->salen,
+				ni->hostname, ni->hostname_len, 1);
+		if (!nsm) {
+			dprintk("lockd: nlm_lookup_host failed; "
+				"no nsm handle\n");
+			goto out;
+		}
+	}
 
 	host = kzalloc(sizeof(*host), GFP_KERNEL);
 	if (!host) {
 		nsm_release(nsm);
+		dprintk("lockd: nlm_lookup_host failed; no memory\n");
 		goto out;
 	}
 	host->h_name	   = nsm->sm_name;
-	host->h_addr       = *sin;
-	host->h_addr.sin_port = 0;	/* ouch! */
-	host->h_saddr	   = *ssin;
-	host->h_version    = version;
-	host->h_proto      = proto;
+	memcpy(nlm_addr(host), ni->sap, ni->salen);
+	host->h_addrlen = ni->salen;
+	nlm_clear_port(nlm_addr(host));
+	memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+	host->h_version    = ni->version;
+	host->h_proto      = ni->protocol;
 	host->h_rpcclnt    = NULL;
 	mutex_init(&host->h_mutex);
 	host->h_nextrebind = jiffies + NLM_HOST_REBIND;
@@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server,
 	host->h_state      = 0;			/* pseudo NSM state */
 	host->h_nsmstate   = 0;			/* real NSM state */
 	host->h_nsmhandle  = nsm;
-	host->h_server	   = server;
+	host->h_server	   = ni->server;
 	hlist_add_head(&host->h_hash, chain);
 	INIT_LIST_HEAD(&host->h_lockowners);
 	spin_lock_init(&host->h_lock);
@@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server,
 	INIT_LIST_HEAD(&host->h_reclaim);
 
 	nrhosts++;
+
+	nlm_display_address((struct sockaddr *)&host->h_addr,
+				host->h_addrbuf, sizeof(host->h_addrbuf));
+	nlm_display_address((struct sockaddr *)&host->h_srcaddr,
+				host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
+
+	dprintk("lockd: nlm_lookup_host created host %s\n",
+			host->h_name);
+
 out:
 	mutex_unlock(&nlm_host_mutex);
 	return host;
@@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host)
 	kfree(host);
 }
 
-/*
- * Find an NLM server handle in the cache. If there is none, create it.
+/**
+ * nlmclnt_lookup_host - Find an NLM host handle matching a remote server
+ * @sap: network address of server
+ * @salen: length of server address
+ * @protocol: transport protocol to use
+ * @version: NLM protocol version
+ * @hostname: '\0'-terminated hostname of server
+ *
+ * Returns an nlm_host structure that matches the passed-in
+ * [server address, transport protocol, NLM version, server hostname].
+ * If one doesn't already exist in the host cache, a new handle is
+ * created and returned.
  */
-struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin,
-				     int proto, u32 version,
-				     const char *hostname,
-				     unsigned int hostname_len)
+struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
+				     const size_t salen,
+				     const unsigned short protocol,
+				     const u32 version, const char *hostname)
 {
-	struct sockaddr_in ssin = {0};
-
-	return nlm_lookup_host(0, sin, proto, version,
-			       hostname, hostname_len, &ssin);
+	const struct sockaddr source = {
+		.sa_family	= AF_UNSPEC,
+	};
+	struct nlm_lookup_host_info ni = {
+		.server		= 0,
+		.sap		= sap,
+		.salen		= salen,
+		.protocol	= protocol,
+		.version	= version,
+		.hostname	= hostname,
+		.hostname_len	= strlen(hostname),
+		.src_sap	= &source,
+		.src_len	= sizeof(source),
+	};
+
+	dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
+			(hostname ? hostname : "<none>"), version,
+			(protocol == IPPROTO_UDP ? "udp" : "tcp"));
+
+	return nlm_lookup_host(&ni);
 }
 
-/*
- * Find an NLM client handle in the cache. If there is none, create it.
+/**
+ * nlmsvc_lookup_host - Find an NLM host handle matching a remote client
+ * @rqstp: incoming NLM request
+ * @hostname: name of client host
+ * @hostname_len: length of client hostname
+ *
+ * Returns an nlm_host structure that matches the [client address,
+ * transport protocol, NLM version, client hostname] of the passed-in
+ * NLM request.  If one doesn't already exist in the host cache, a
+ * new handle is created and returned.
+ *
+ * Before possibly creating a new nlm_host, construct a sockaddr
+ * for a specific source address in case the local system has
+ * multiple network addresses.  The family of the address in
+ * rq_daddr is guaranteed to be the same as the family of the
+ * address in rq_addr, so it's safe to use the same family for
+ * the source address.
  */
-struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp,
-			const char *hostname, unsigned int hostname_len)
+struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
+				    const char *hostname,
+				    const size_t hostname_len)
 {
-	struct sockaddr_in ssin = {0};
+	struct sockaddr_in sin = {
+		.sin_family	= AF_INET,
+	};
+	struct sockaddr_in6 sin6 = {
+		.sin6_family	= AF_INET6,
+	};
+	struct nlm_lookup_host_info ni = {
+		.server		= 1,
+		.sap		= svc_addr(rqstp),
+		.salen		= rqstp->rq_addrlen,
+		.protocol	= rqstp->rq_prot,
+		.version	= rqstp->rq_vers,
+		.hostname	= hostname,
+		.hostname_len	= hostname_len,
+		.src_len	= rqstp->rq_addrlen,
+	};
+
+	dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
+			(int)hostname_len, hostname, rqstp->rq_vers,
+			(rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
+
+	switch (ni.sap->sa_family) {
+	case AF_INET:
+		sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
+		ni.src_sap = (struct sockaddr *)&sin;
+		break;
+	case AF_INET6:
+		ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
+		ni.src_sap = (struct sockaddr *)&sin6;
+		break;
+	default:
+		return NULL;
+	}
 
-	ssin.sin_addr = rqstp->rq_daddr.addr;
-	return nlm_lookup_host(1, svc_addr_in(rqstp),
-			       rqstp->rq_prot, rqstp->rq_vers,
-			       hostname, hostname_len, &ssin);
+	return nlm_lookup_host(&ni);
 }
 
 /*
@@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host)
 {
 	struct rpc_clnt	*clnt;
 
-	dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n",
-			NIPQUAD(host->h_saddr.sin_addr),
-			NIPQUAD(host->h_addr.sin_addr));
+	dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
+			host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
 
 	/* Lock host handle */
 	mutex_lock(&host->h_mutex);
@@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host)
 		if (time_after_eq(jiffies, host->h_nextrebind)) {
 			rpc_force_rebind(clnt);
 			host->h_nextrebind = jiffies + NLM_HOST_REBIND;
-			dprintk("lockd: next rebind in %ld jiffies\n",
+			dprintk("lockd: next rebind in %lu jiffies\n",
 					host->h_nextrebind - jiffies);
 		}
 	} else {
@@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host)
 		};
 		struct rpc_create_args args = {
 			.protocol	= host->h_proto,
-			.address	= (struct sockaddr *)&host->h_addr,
-			.addrsize	= sizeof(host->h_addr),
-			.saddress	= (struct sockaddr *)&host->h_saddr,
+			.address	= nlm_addr(host),
+			.addrsize	= host->h_addrlen,
+			.saddress	= nlm_srcaddr(host),
 			.timeout	= &timeparms,
 			.servername	= host->h_name,
 			.program	= &nlm_program,
@@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin,
 	struct nsm_handle *nsm;
 	struct nlm_host	*host;
 
-	dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
-			hostname, NIPQUAD(sin->sin_addr));
-
-	/* Find the NSM handle for this peer */
-	if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
+	nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
+			hostname, hostname_len, 0);
+	if (nsm == NULL) {
+		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+				hostname_len, hostname);
 		return;
+	}
+
+	dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
+			hostname_len, hostname, nsm->sm_addrbuf);
 
 	/* When reclaiming locks on this peer, make sure that
 	 * we set up a new notification */
@@ -461,22 +628,23 @@ nlm_gc_hosts(void)
 static LIST_HEAD(nsm_handles);
 static DEFINE_SPINLOCK(nsm_lock);
 
-static struct nsm_handle *
-__nsm_find(const struct sockaddr_in *sin,
-		const char *hostname, unsigned int hostname_len,
-		int create)
+static struct nsm_handle *nsm_find(const struct sockaddr *sap,
+				   const size_t salen,
+				   const char *hostname,
+				   const size_t hostname_len,
+				   const int create)
 {
 	struct nsm_handle *nsm = NULL;
 	struct nsm_handle *pos;
 
-	if (!sin)
+	if (!sap)
 		return NULL;
 
 	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
 		if (printk_ratelimit()) {
 			printk(KERN_WARNING "Invalid hostname \"%.*s\" "
 					    "in NFS lock request\n",
-				hostname_len, hostname);
+				(int)hostname_len, hostname);
 		}
 		return NULL;
 	}
@@ -489,7 +657,7 @@ retry:
 			if (strlen(pos->sm_name) != hostname_len
 			 || memcmp(pos->sm_name, hostname, hostname_len))
 				continue;
-		} else if (!nlm_cmp_addr(&pos->sm_addr, sin))
+		} else if (!nlm_cmp_addr(nsm_addr(pos), sap))
 			continue;
 		atomic_inc(&pos->sm_count);
 		kfree(nsm);
@@ -509,10 +677,13 @@ retry:
 	if (nsm == NULL)
 		return NULL;
 
-	nsm->sm_addr = *sin;
+	memcpy(nsm_addr(nsm), sap, salen);
+	nsm->sm_addrlen = salen;
 	nsm->sm_name = (char *) (nsm + 1);
 	memcpy(nsm->sm_name, hostname, hostname_len);
 	nsm->sm_name[hostname_len] = '\0';
+	nlm_display_address((struct sockaddr *)&nsm->sm_addr,
+				nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
 	atomic_set(&nsm->sm_count, 1);
 	goto retry;
 
@@ -521,13 +692,6 @@ found:
 	return nsm;
 }
 
-static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname,
-	 unsigned int hostname_len)
-{
-	return __nsm_find(sin, hostname, hostname_len, 1);
-}
-
 /*
  * Release an NSM handle
  */
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e4d563543b1..4e7e958e8f6 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 
 	memset(&args, 0, sizeof(args));
 	args.mon_name = nsm->sm_name;
-	args.addr = nsm->sm_addr.sin_addr.s_addr;
+	args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
 	args.prog = NLM_PROGRAM;
 	args.vers = 3;
 	args.proc = NLMPROC_NSM_NOTIFY;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 5bd9bf0fa9d..c631a83931c 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex);
 static unsigned int		nlmsvc_users;
 static struct task_struct	*nlmsvc_task;
 static struct svc_rqst		*nlmsvc_rqst;
-int				nlmsvc_grace_period;
 unsigned long			nlmsvc_timeout;
 
 /*
@@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void)
 		return nlm_timeout * 5 * HZ;
 }
 
-unsigned long get_nfs_grace_period(void)
-{
-	unsigned long lockdgrace = get_lockd_grace_period();
-	unsigned long nfsdgrace = 0;
-
-	if (nlmsvc_ops)
-		nfsdgrace = nlmsvc_ops->get_grace_period();
-
-	return max(lockdgrace, nfsdgrace);
-}
-EXPORT_SYMBOL(get_nfs_grace_period);
+static struct lock_manager lockd_manager = {
+};
 
-static unsigned long set_grace_period(void)
+static void grace_ender(struct work_struct *not_used)
 {
-	nlmsvc_grace_period = 1;
-	return get_nfs_grace_period() + jiffies;
+	locks_end_grace(&lockd_manager);
 }
 
-static inline void clear_grace_period(void)
+static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
+
+static void set_grace_period(void)
 {
-	nlmsvc_grace_period = 0;
+	unsigned long grace_period = get_lockd_grace_period();
+
+	locks_start_grace(&lockd_manager);
+	cancel_delayed_work_sync(&grace_period_end);
+	schedule_delayed_work(&grace_period_end, grace_period);
 }
 
 /*
@@ -116,7 +111,6 @@ lockd(void *vrqstp)
 {
 	int		err = 0, preverr = 0;
 	struct svc_rqst *rqstp = vrqstp;
-	unsigned long grace_period_expire;
 
 	/* try_to_freeze() is called from svc_recv() */
 	set_freezable();
@@ -139,7 +133,7 @@ lockd(void *vrqstp)
 		nlm_timeout = LOCKD_DFLT_TIMEO;
 	nlmsvc_timeout = nlm_timeout * HZ;
 
-	grace_period_expire = set_grace_period();
+	set_grace_period();
 
 	/*
 	 * The main request loop. We don't terminate until the last
@@ -153,21 +147,12 @@ lockd(void *vrqstp)
 			flush_signals(current);
 			if (nlmsvc_ops) {
 				nlmsvc_invalidate_all();
-				grace_period_expire = set_grace_period();
+				set_grace_period();
 			}
 			continue;
 		}
 
-		/*
-		 * Retry any blocked locks that have been notified by
-		 * the VFS. Don't do this during grace period.
-		 * (Theoretically, there shouldn't even be blocked locks
-		 * during grace period).
-		 */
-		if (!nlmsvc_grace_period) {
-			timeout = nlmsvc_retry_blocked();
-		} else if (time_before(grace_period_expire, jiffies))
-			clear_grace_period();
+		timeout = nlmsvc_retry_blocked();
 
 		/*
 		 * Find a socket with data available and call its
@@ -195,6 +180,7 @@ lockd(void *vrqstp)
 		svc_process(rqstp);
 	}
 	flush_signals(current);
+	cancel_delayed_work_sync(&grace_period_end);
 	if (nlmsvc_ops)
 		nlmsvc_invalidate_all();
 	nlm_shutdown_hosts();
@@ -203,25 +189,28 @@ lockd(void *vrqstp)
 }
 
 /*
- * Make any sockets that are needed but not present.
- * If nlm_udpport or nlm_tcpport were set as module
- * options, make those sockets unconditionally
+ * Ensure there are active UDP and TCP listeners for lockd.
+ *
+ * Even if we have only TCP NFS mounts and/or TCP NFSDs, some
+ * local services (such as rpc.statd) still require UDP, and
+ * some NFS servers do not yet support NLM over TCP.
+ *
+ * Returns zero if all listeners are available; otherwise a
+ * negative errno value is returned.
  */
-static int make_socks(struct svc_serv *serv, int proto)
+static int make_socks(struct svc_serv *serv)
 {
 	static int warned;
 	struct svc_xprt *xprt;
 	int err = 0;
 
-	if (proto == IPPROTO_UDP || nlm_udpport) {
-		xprt = svc_find_xprt(serv, "udp", 0, 0);
-		if (!xprt)
-			err = svc_create_xprt(serv, "udp", nlm_udpport,
-					      SVC_SOCK_DEFAULTS);
-		else
-			svc_xprt_put(xprt);
-	}
-	if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+	xprt = svc_find_xprt(serv, "udp", 0, 0);
+	if (!xprt)
+		err = svc_create_xprt(serv, "udp", nlm_udpport,
+				      SVC_SOCK_DEFAULTS);
+	else
+		svc_xprt_put(xprt);
+	if (err >= 0) {
 		xprt = svc_find_xprt(serv, "tcp", 0, 0);
 		if (!xprt)
 			err = svc_create_xprt(serv, "tcp", nlm_tcpport,
@@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto)
 /*
  * Bring up the lockd process if it's not already up.
  */
-int
-lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
+int lockd_up(void)
 {
 	struct svc_serv *serv;
 	int		error = 0;
@@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
 	/*
 	 * Check whether we're already up and running.
 	 */
-	if (nlmsvc_rqst) {
-		if (proto)
-			error = make_socks(nlmsvc_rqst->rq_server, proto);
+	if (nlmsvc_rqst)
 		goto out;
-	}
 
 	/*
 	 * Sanity check: if there's no pid,
@@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
 			"lockd_up: no pid, %d users??\n", nlmsvc_users);
 
 	error = -ENOMEM;
-	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
+	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
 	if (!serv) {
 		printk(KERN_WARNING "lockd_up: create service failed\n");
 		goto out;
 	}
 
-	if ((error = make_socks(serv, proto)) < 0)
+	error = make_socks(serv);
+	if (error < 0)
 		goto destroy_and_out;
 
 	/*
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4a714f64515..014f6ce4817 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -88,12 +88,6 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	dprintk("lockd: TEST4        called\n");
 	resp->cookie = argp->cookie;
 
-	/* Don't accept test requests during grace period */
-	if (nlmsvc_grace_period) {
-		resp->status = nlm_lck_denied_grace_period;
-		return rc;
-	}
-
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -122,12 +116,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	resp->cookie = argp->cookie;
 
-	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period && !argp->reclaim) {
-		resp->status = nlm_lck_denied_grace_period;
-		return rc;
-	}
-
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Now try to lock the file */
 	resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
-					argp->block, &argp->cookie);
+					argp->block, &argp->cookie,
+					argp->reclaim);
 	if (resp->status == nlm_drop_reply)
 		rc = rpc_drop_reply;
 	else
@@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	dprintk("lockd: GRANTED       called\n");
-	resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+	resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
 	dprintk("lockd: GRANTED       status %d\n", ntohl(resp->status));
 	return rpc_success;
 }
@@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period && !argp->reclaim) {
+	if (locks_in_grace() && !argp->reclaim) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 {
 	struct sockaddr_in	saddr;
 
-	memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
 	dprintk("lockd: SM_NOTIFY     called\n");
-	if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
-	 || ntohs(saddr.sin_port) >= 1024) {
+
+	if (!nlm_privileged_requester(rqstp)) {
 		char buf[RPC_MAX_ADDRBUFLEN];
 		printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
 				svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index cf0d5c2c318..6063a8e4b9f 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
 __be32
 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	    struct nlm_host *host, struct nlm_lock *lock, int wait,
-	    struct nlm_cookie *cookie)
+	    struct nlm_cookie *cookie, int reclaim)
 {
 	struct nlm_block	*block = NULL;
 	int			error;
@@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
+	if (locks_in_grace() && !reclaim) {
+		ret = nlm_lck_denied_grace_period;
+		goto out;
+	}
+	if (reclaim && !locks_in_grace()) {
+		ret = nlm_lck_denied_grace_period;
+		goto out;
+	}
+
 	if (!wait)
 		lock->fl.fl_flags &= ~FL_SLEEP;
 	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
@@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
+	if (locks_in_grace()) {
+		ret = nlm_lck_denied_grace_period;
+		goto out;
+	}
 	error = vfs_test_lock(file->f_file, &lock->fl);
 	if (error == FILE_LOCK_DEFERRED) {
 		ret = nlmsvc_defer_lock_rqst(rqstp, block);
@@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
+	if (locks_in_grace())
+		return nlm_lck_denied_grace_period;
+
 	mutex_lock(&file->f_mutex);
 	block = nlmsvc_lookup_block(file, lock);
 	mutex_unlock(&file->f_mutex);
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 76262c1986f..548b0bb2b84 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -117,12 +117,6 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	dprintk("lockd: TEST          called\n");
 	resp->cookie = argp->cookie;
 
-	/* Don't accept test requests during grace period */
-	if (nlmsvc_grace_period) {
-		resp->status = nlm_lck_denied_grace_period;
-		return rc;
-	}
-
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -152,12 +146,6 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	resp->cookie = argp->cookie;
 
-	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period && !argp->reclaim) {
-		resp->status = nlm_lck_denied_grace_period;
-		return rc;
-	}
-
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Now try to lock the file */
 	resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
-					       argp->block, &argp->cookie));
+					       argp->block, &argp->cookie,
+					       argp->reclaim));
 	if (resp->status == nlm_drop_reply)
 		rc = rpc_drop_reply;
 	else
@@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	dprintk("lockd: GRANTED       called\n");
-	resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+	resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
 	dprintk("lockd: GRANTED       status %d\n", ntohl(resp->status));
 	return rpc_success;
 }
@@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept new lock requests during grace period */
-	if (nlmsvc_grace_period && !argp->reclaim) {
+	if (locks_in_grace() && !argp->reclaim) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 	resp->cookie = argp->cookie;
 
 	/* Don't accept requests during grace period */
-	if (nlmsvc_grace_period) {
+	if (locks_in_grace()) {
 		resp->status = nlm_lck_denied_grace_period;
 		return rpc_success;
 	}
@@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 {
 	struct sockaddr_in	saddr;
 
-	memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
 	dprintk("lockd: SM_NOTIFY     called\n");
-	if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
-	 || ntohs(saddr.sin_port) >= 1024) {
+
+	if (!nlm_privileged_requester(rqstp)) {
 		char buf[RPC_MAX_ADDRBUFLEN];
 		printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
 				svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 198b4e55b37..34c2766e27c 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
 static int
 nlmsvc_match_ip(void *datap, struct nlm_host *host)
 {
-	return nlm_cmp_addr(&host->h_saddr, datap);
+	return nlm_cmp_addr(nlm_srcaddr(host), datap);
 }
 
 /**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 3e459e18cc3..1f226290c67 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
 	argp->state = ntohl(*p++);
 	/* Preserve the address in network byte order */
 	argp->addr = *p++;
-	argp->vers = *p++;
-	argp->proto = *p++;
 	return xdr_argsize_check(rqstp, p);
 }
 
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 43ff9397e6c..50c493a8ad8 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
 	argp->state = ntohl(*p++);
 	/* Preserve the address in network byte order */
 	argp->addr  = *p++;
-	argp->vers  = *p++;
-	argp->proto = *p++;
 	return xdr_argsize_check(rqstp, p);
 }
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index f447f4b4476..6a09760c596 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -105,7 +105,8 @@ int nfs_callback_up(void)
 	mutex_lock(&nfs_callback_mutex);
 	if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
 		goto out;
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+				AF_INET, NULL);
 	ret = -ENOMEM;
 	if (!serv)
 		goto out_err;
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 15c6faeec77..b2786a5f9af 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -70,7 +70,6 @@ nlm_fclose(struct file *filp)
 static struct nlmsvc_binding	nfsd_nlm_ops = {
 	.fopen		= nlm_fopen,		/* open file for locking */
 	.fclose		= nlm_fclose,		/* close file */
-	.get_grace_period = get_nfs4_grace_period,
 };
 
 void
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 4d617ea28cf..9dbd2eb9128 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 		SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
-	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	nfserr = fh_verify(rqstp, &resp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
 	if (nfserr)
 		RETURN_STATUS(nfserr);
 
@@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 	dprintk("nfsd: FSSTAT(3)   %s\n",
 				SVCFH_fmt(&argp->fh));
 
-	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
 	fh_put(&argp->fh);
 	RETURN_STATUS(nfserr);
 }
@@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 	resp->f_maxfilesize = ~(u32) 0;
 	resp->f_properties = NFS3_FSF_DEFAULT;
 
-	nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
+	nfserr = fh_verify(rqstp, &argp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
 
 	/* Check special features of the file system. May request
 	 * different read/write sizes for file systems known to have
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 702fa577aa6..094747a1227 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 
 	RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
 	WRITE32(OP_CB_RECALL);
-	WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t));
+	WRITE32(cb_rec->cbr_stateid.si_generation);
+	WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
 	WRITE32(cb_rec->cbr_trunc);
 	WRITE32(len);
 	WRITEMEM(cb_rec->cbr_fhval, len);
@@ -379,6 +380,7 @@ static int do_probe_callback(void *data)
 		.addrsize	= sizeof(addr),
 		.timeout	= &timeparms,
 		.program	= &cb_program,
+		.prognumber	= cb->cb_prog,
 		.version	= nfs_cb_version[1]->number,
 		.authflavor	= RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
@@ -396,9 +398,6 @@ static int do_probe_callback(void *data)
 	addr.sin_port = htons(cb->cb_port);
 	addr.sin_addr.s_addr = htonl(cb->cb_addr);
 
-	/* Initialize rpc_stat */
-	memset(args.program->stats, 0, sizeof(struct rpc_stat));
-
 	/* Create RPC client */
 	client = rpc_create(&args);
 	if (IS_ERR(client)) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index e5b51ffafc6..669461e291a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	/* Openowner is now set, so sequence id will get bumped.  Now we need
 	 * these checks before we do any creates: */
 	status = nfserr_grace;
-	if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+	if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
 		goto out;
 	status = nfserr_no_grace;
-	if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+	if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
 		goto out;
 
 	switch (open->op_claim_type) {
@@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	__be32 status;
 
-	if (nfs4_in_grace())
+	if (locks_in_grace())
 		return nfserr_grace;
 	status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
 			     remove->rm_name, remove->rm_namelen);
@@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (!cstate->save_fh.fh_dentry)
 		return status;
-	if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags
+	if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags
 					& NFSEXP_NOSUBTREECHECK))
 		return nfserr_grace;
 	status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1578d7a2667..0cc7ff5d5ab 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@
 static time_t lease_time = 90;     /* default lease time */
 static time_t user_lease_time = 90;
 static time_t boot_time;
-static int in_grace = 1;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
 static u32 current_delegid = 1;
@@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 		case NFS4_OPEN_CLAIM_NULL:
 			/* Let's not give out any delegations till everyone's
 			 * had the chance to reclaim theirs.... */
-			if (nfs4_in_grace())
+			if (locks_in_grace())
 				goto out;
 			if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
 				goto out;
@@ -1816,12 +1815,15 @@ out:
 	return status;
 }
 
+struct lock_manager nfsd4_manager = {
+};
+
 static void
-end_grace(void)
+nfsd4_end_grace(void)
 {
 	dprintk("NFSD: end of grace period\n");
 	nfsd4_recdir_purge_old();
-	in_grace = 0;
+	locks_end_grace(&nfsd4_manager);
 }
 
 static time_t
@@ -1838,8 +1840,8 @@ nfs4_laundromat(void)
 	nfs4_lock_state();
 
 	dprintk("NFSD: laundromat service - starting\n");
-	if (in_grace)
-		end_grace();
+	if (locks_in_grace())
+		nfsd4_end_grace();
 	list_for_each_safe(pos, next, &client_lru) {
 		clp = list_entry(pos, struct nfs4_client, cl_lru);
 		if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 		return nfserr_bad_stateid;
 	else if (ONE_STATEID(stateid) && (flags & RD_STATE))
 		return nfs_ok;
-	else if (nfs4_in_grace()) {
+	else if (locks_in_grace()) {
 		/* Answer in remaining cases depends on existance of
 		 * conflicting state; so we must wait out the grace period. */
 		return nfserr_grace;
@@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 static inline int
 io_during_grace_disallowed(struct inode *inode, int flags)
 {
-	return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE))
+	return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
 		&& mandatory_lock(inode);
 }
 
@@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	filp = lock_stp->st_vfs_file;
 
 	status = nfserr_grace;
-	if (nfs4_in_grace() && !lock->lk_reclaim)
+	if (locks_in_grace() && !lock->lk_reclaim)
 		goto out;
 	status = nfserr_no_grace;
-	if (!nfs4_in_grace() && lock->lk_reclaim)
+	if (!locks_in_grace() && lock->lk_reclaim)
 		goto out;
 
 	locks_init_lock(&file_lock);
@@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	int error;
 	__be32 status;
 
-	if (nfs4_in_grace())
+	if (locks_in_grace())
 		return nfserr_grace;
 
 	if (check_lock_length(lockt->lt_offset, lockt->lt_length))
@@ -3192,9 +3194,9 @@ __nfs4_state_start(void)
 	unsigned long grace_time;
 
 	boot_time = get_seconds();
-	grace_time = get_nfs_grace_period();
+	grace_time = get_nfs4_grace_period();
 	lease_time = user_lease_time;
-	in_grace = 1;
+	locks_start_grace(&nfsd4_manager);
 	printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
 	       grace_time/HZ);
 	laundry_wq = create_singlethread_workqueue("nfsd4");
@@ -3213,12 +3215,6 @@ nfs4_state_start(void)
 	return;
 }
 
-int
-nfs4_in_grace(void)
-{
-	return in_grace;
-}
-
 time_t
 nfs4_lease_time(void)
 {
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 14ba4d9b285..afcdf4b7684 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -413,6 +413,18 @@ out_nfserr:
 }
 
 static __be32
+nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
+{
+	DECODE_HEAD;
+
+	READ_BUF(sizeof(stateid_t));
+	READ32(sid->si_generation);
+	COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+
+	DECODE_TAIL;
+}
+
+static __be32
 nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
 {
 	DECODE_HEAD;
@@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
 	DECODE_HEAD;
 
 	close->cl_stateowner = NULL;
-	READ_BUF(4 + sizeof(stateid_t));
+	READ_BUF(4);
 	READ32(close->cl_seqid);
-	READ32(close->cl_stateid.si_generation);
-	COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
+	return nfsd4_decode_stateid(argp, &close->cl_stateid);
 
 	DECODE_TAIL;
 }
@@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
 static inline __be32
 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
 {
-	DECODE_HEAD;
-
-	READ_BUF(sizeof(stateid_t));
-	READ32(dr->dr_stateid.si_generation);
-	COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
-
-	DECODE_TAIL;
+	return nfsd4_decode_stateid(argp, &dr->dr_stateid);
 }
 
 static inline __be32
@@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 	READ32(lock->lk_is_new);
 
 	if (lock->lk_is_new) {
-		READ_BUF(36);
+		READ_BUF(4);
 		READ32(lock->lk_new_open_seqid);
-		READ32(lock->lk_new_open_stateid.si_generation);
-
-		COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t));
+		status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
+		if (status)
+			return status;
+		READ_BUF(8 + sizeof(clientid_t));
 		READ32(lock->lk_new_lock_seqid);
 		COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
 		READ32(lock->lk_new_owner.len);
 		READ_BUF(lock->lk_new_owner.len);
 		READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
 	} else {
-		READ_BUF(20);
-		READ32(lock->lk_old_lock_stateid.si_generation);
-		COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t));
+		status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
+		if (status)
+			return status;
+		READ_BUF(4);
 		READ32(lock->lk_old_lock_seqid);
 	}
 
@@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
 	DECODE_HEAD;
 
 	locku->lu_stateowner = NULL;
-	READ_BUF(24 + sizeof(stateid_t));
+	READ_BUF(8);
 	READ32(locku->lu_type);
 	if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
 		goto xdr_error;
 	READ32(locku->lu_seqid);
-	READ32(locku->lu_stateid.si_generation);
-	COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
+	if (status)
+		return status;
+	READ_BUF(16);
 	READ64(locku->lu_offset);
 	READ64(locku->lu_length);
 
@@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 		READ32(open->op_delegate_type);
 		break;
 	case NFS4_OPEN_CLAIM_DELEGATE_CUR:
-		READ_BUF(sizeof(stateid_t) + 4);
-		COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+		status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+		if (status)
+			return status;
+		READ_BUF(4);
 		READ32(open->op_fname.len);
 		READ_BUF(open->op_fname.len);
 		SAVEMEM(open->op_fname.data, open->op_fname.len);
@@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
 	DECODE_HEAD;
 		    
 	open_conf->oc_stateowner = NULL;
-	READ_BUF(4 + sizeof(stateid_t));
-	READ32(open_conf->oc_req_stateid.si_generation);
-	COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
+	if (status)
+		return status;
+	READ_BUF(4);
 	READ32(open_conf->oc_seqid);
 						        
 	DECODE_TAIL;
@@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
 	DECODE_HEAD;
 		    
 	open_down->od_stateowner = NULL;
-	READ_BUF(12 + sizeof(stateid_t));
-	READ32(open_down->od_stateid.si_generation);
-	COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
+	if (status)
+		return status;
+	READ_BUF(12);
 	READ32(open_down->od_seqid);
 	READ32(open_down->od_share_access);
 	READ32(open_down->od_share_deny);
@@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
 {
 	DECODE_HEAD;
 
-	READ_BUF(sizeof(stateid_t) + 12);
-	READ32(read->rd_stateid.si_generation);
-	COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &read->rd_stateid);
+	if (status)
+		return status;
+	READ_BUF(12);
 	READ64(read->rd_offset);
 	READ32(read->rd_length);
 
@@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
 static __be32
 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
 {
-	DECODE_HEAD;
-
-	READ_BUF(sizeof(stateid_t));
-	READ32(setattr->sa_stateid.si_generation);
-	COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
-	if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
-		goto out;
+	__be32 status;
 
-	DECODE_TAIL;
+	status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
+	if (status)
+		return status;
+	return nfsd4_decode_fattr(argp, setattr->sa_bmval,
+				  &setattr->sa_iattr, &setattr->sa_acl);
 }
 
 static __be32
@@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
 	int len;
 	DECODE_HEAD;
 
-	READ_BUF(sizeof(stateid_opaque_t) + 20);
-	READ32(write->wr_stateid.si_generation);
-	COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t));
+	status = nfsd4_decode_stateid(argp, &write->wr_stateid);
+	if (status)
+		return status;
+	READ_BUF(16);
 	READ64(write->wr_offset);
 	READ32(write->wr_stable_how);
 	if (write->wr_stable_how > 2)
@@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
  * Header routine to setup seqid operation replay cache
  */
 #define ENCODE_SEQID_OP_HEAD					\
-	__be32 *p;						\
 	__be32 *save;						\
 								\
 	save = resp->p;
@@ -1950,6 +1962,17 @@ fail:
 	return -EINVAL;
 }
 
+static void
+nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
+{
+	ENCODE_HEAD;
+
+	RESERVE_SPACE(sizeof(stateid_t));
+	WRITE32(sid->si_generation);
+	WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+	ADJUST_ARGS();
+}
+
 static __be32
 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
 {
@@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
 {
 	ENCODE_SEQID_OP_HEAD;
 
-	if (!nfserr) {
-		RESERVE_SPACE(sizeof(stateid_t));
-		WRITE32(close->cl_stateid.si_generation);
-		WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	}
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &close->cl_stateid);
+
 	ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
 	return nfserr;
 }
@@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
 {
 	ENCODE_SEQID_OP_HEAD;
 
-	if (!nfserr) {
-		RESERVE_SPACE(4 + sizeof(stateid_t));
-		WRITE32(lock->lk_resp_stateid.si_generation);
-		WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	} else if (nfserr == nfserr_denied)
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
+	else if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lock->lk_denied);
 
 	ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
@@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
 {
 	ENCODE_SEQID_OP_HEAD;
 
-	if (!nfserr) {
-		RESERVE_SPACE(sizeof(stateid_t));
-		WRITE32(locku->lu_stateid.si_generation);
-		WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	}
-				        
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &locku->lu_stateid);
+
 	ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
 	return nfserr;
 }
@@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
 static __be32
 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
 {
+	ENCODE_HEAD;
 	ENCODE_SEQID_OP_HEAD;
 
 	if (nfserr)
 		goto out;
 
-	RESERVE_SPACE(36 + sizeof(stateid_t));
-	WRITE32(open->op_stateid.si_generation);
-	WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t));
+	nfsd4_encode_stateid(resp, &open->op_stateid);
+	RESERVE_SPACE(40);
 	WRITECINFO(open->op_cinfo);
 	WRITE32(open->op_rflags);
 	WRITE32(2);
@@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 	case NFS4_OPEN_DELEGATE_NONE:
 		break;
 	case NFS4_OPEN_DELEGATE_READ:
-		RESERVE_SPACE(20 + sizeof(stateid_t));
-		WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+		nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+		RESERVE_SPACE(20);
 		WRITE32(open->op_recall);
 
 		/*
@@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
 		ADJUST_ARGS();
 		break;
 	case NFS4_OPEN_DELEGATE_WRITE:
-		RESERVE_SPACE(32 + sizeof(stateid_t));
-		WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+		nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+		RESERVE_SPACE(32);
 		WRITE32(0);
 
 		/*
@@ -2195,13 +2208,9 @@ static __be32
 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
 {
 	ENCODE_SEQID_OP_HEAD;
-				        
-	if (!nfserr) {
-		RESERVE_SPACE(sizeof(stateid_t));
-		WRITE32(oc->oc_resp_stateid.si_generation);
-		WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	}
+
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
 
 	ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
 	return nfserr;
@@ -2211,13 +2220,9 @@ static __be32
 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
 {
 	ENCODE_SEQID_OP_HEAD;
-				        
-	if (!nfserr) {
-		RESERVE_SPACE(sizeof(stateid_t));
-		WRITE32(od->od_stateid.si_generation);
-		WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t));
-		ADJUST_ARGS();
-	}
+
+	if (!nfserr)
+		nfsd4_encode_stateid(resp, &od->od_stateid);
 
 	ENCODE_SEQID_OP_TAIL(od->od_stateowner);
 	return nfserr;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index c53e65f8f3a..97543df5824 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 			return -EINVAL;
 		err = nfsd_create_serv();
 		if (!err) {
-			int proto = 0;
-			err = svc_addsock(nfsd_serv, fd, buf, &proto);
+			err = svc_addsock(nfsd_serv, fd, buf);
 			if (err >= 0) {
-				err = lockd_up(proto);
+				err = lockd_up();
 				if (err < 0)
 					svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
 			}
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ea37c96f044..cd25d91895a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 	if (error)
 		goto out;
 
-	if (!(access & NFSD_MAY_LOCK)) {
-		/*
-		 * pseudoflavor restrictions are not enforced on NLM,
-		 * which clients virtually always use auth_sys for,
-		 * even while using RPCSEC_GSS for NFS.
-		 */
-		error = check_nfsd_access(exp, rqstp);
-		if (error)
-			goto out;
-	}
+	/*
+	 * pseudoflavor restrictions are not enforced on NLM,
+	 * which clients virtually always use auth_sys for,
+	 * even while using RPCSEC_GSS for NFS.
+	 */
+	if (access & NFSD_MAY_LOCK)
+		goto skip_pseudoflavor_check;
+	/*
+	 * Clients may expect to be able to use auth_sys during mount,
+	 * even if they use gss for everything else; see section 2.3.2
+	 * of rfc 2623.
+	 */
+	if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
+			&& exp->ex_path.dentry == dentry)
+		goto skip_pseudoflavor_check;
+
+	error = check_nfsd_access(exp, rqstp);
+	if (error)
+		goto out;
 
+skip_pseudoflavor_check:
 	/* Finally, check access permissions. */
 	error = nfsd_permission(rqstp, exp, dentry, access);
 
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0766f95d236..5cffeca7ace 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
-	nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+	nfserr = fh_verify(rqstp, &resp->fh, 0,
+			NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
 	return nfsd_return_attrs(nfserr, resp);
 }
 
@@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle   *argp,
 
 	dprintk("nfsd: STATFS   %s\n", SVCFH_fmt(&argp->fh));
 
-	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+	nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
+			NFSD_MAY_BYPASS_GSS_ON_ROOT);
 	fh_put(&argp->fh);
 	return nfserr;
 }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 80292ff5e92..59eeb46f82c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,6 +229,7 @@ int nfsd_create_serv(void)
 
 	atomic_set(&nfsd_busy, 0);
 	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+				      AF_INET,
 				      nfsd_last_thread, nfsd, THIS_MODULE);
 	if (nfsd_serv == NULL)
 		err = -ENOMEM;
@@ -243,25 +244,20 @@ static int nfsd_init_socks(int port)
 	if (!list_empty(&nfsd_serv->sv_permsocks))
 		return 0;
 
-	error = lockd_up(IPPROTO_UDP);
-	if (error >= 0) {
-		error = svc_create_xprt(nfsd_serv, "udp", port,
+	error = svc_create_xprt(nfsd_serv, "udp", port,
 					SVC_SOCK_DEFAULTS);
-		if (error < 0)
-			lockd_down();
-	}
 	if (error < 0)
 		return error;
 
-	error = lockd_up(IPPROTO_TCP);
-	if (error >= 0) {
-		error = svc_create_xprt(nfsd_serv, "tcp", port,
+	error = svc_create_xprt(nfsd_serv, "tcp", port,
 					SVC_SOCK_DEFAULTS);
-		if (error < 0)
-			lockd_down();
-	}
 	if (error < 0)
 		return error;
+
+	error = lockd_up();
+	if (error < 0)
+		return error;
+
 	return 0;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 18060bed526..aa1d0d6489a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -83,7 +83,6 @@ struct raparm_hbucket {
 	spinlock_t		pb_lock;
 } ____cacheline_aligned_in_smp;
 
-static struct raparms *		raparml;
 #define RAPARM_HASH_BITS	4
 #define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
 #define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
@@ -1866,9 +1865,9 @@ out:
  * N.B. After this call fhp needs an fh_put
  */
 __be32
-nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
+nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-	__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
+	__be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
 	if (!err && vfs_statfs(fhp->fh_dentry,stat))
 		err = nfserr_io;
 	return err;
@@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 void
 nfsd_racache_shutdown(void)
 {
-	if (!raparml)
-		return;
+	struct raparms *raparm, *last_raparm;
+	unsigned int i;
+
 	dprintk("nfsd: freeing readahead buffers.\n");
-	kfree(raparml);
-	raparml = NULL;
+
+	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
+		raparm = raparm_hash[i].pb_head;
+		while(raparm) {
+			last_raparm = raparm;
+			raparm = raparm->p_next;
+			kfree(last_raparm);
+		}
+		raparm_hash[i].pb_head = NULL;
+	}
 }
 /*
  * Initialize readahead param cache
@@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size)
 	int	i;
 	int	j = 0;
 	int	nperbucket;
+	struct raparms **raparm = NULL;
 
 
-	if (raparml)
+	if (raparm_hash[0].pb_head)
 		return 0;
-	if (cache_size < 2*RAPARM_HASH_SIZE)
-		cache_size = 2*RAPARM_HASH_SIZE;
-	raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL);
-
-	if (!raparml) {
-		printk(KERN_WARNING
-			"nfsd: Could not allocate memory read-ahead cache.\n");
-		return -ENOMEM;
-	}
+	nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
+	if (nperbucket < 2)
+		nperbucket = 2;
+	cache_size = nperbucket * RAPARM_HASH_SIZE;
 
 	dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
-	for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
-		raparm_hash[i].pb_head = NULL;
+
+	for (i = 0; i < RAPARM_HASH_SIZE; i++) {
 		spin_lock_init(&raparm_hash[i].pb_lock);
-	}
-	nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
-	for (i = 0; i < cache_size - 1; i++) {
-		if (i % nperbucket == 0)
-			raparm_hash[j++].pb_head = raparml + i;
-		if (i % nperbucket < nperbucket-1)
-			raparml[i].p_next = raparml + i + 1;
+
+		raparm = &raparm_hash[i].pb_head;
+		for (j = 0; j < nperbucket; j++) {
+			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
+			if (!*raparm)
+				goto out_nomem;
+			raparm = &(*raparm)->p_next;
+		}
+		*raparm = NULL;
 	}
 
 	nfsdstats.ra_size = cache_size;
 	return 0;
+
+out_nomem:
+	dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
+	nfsd_racache_shutdown();
+	return -ENOMEM;
 }
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f6956de56fd..589dcdfdfe3 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -34,7 +34,8 @@ ocfs2-objs := \
 	symlink.o 		\
 	sysfile.o 		\
 	uptodate.o		\
-	ver.o
+	ver.o			\
+	xattr.o
 
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 29ff57ec5d1..0cc2deb9394 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,340 @@
 
 #include "buffer_head_io.h"
 
+
+/*
+ * Operations for a specific extent tree type.
+ *
+ * To implement an on-disk btree (extent tree) type in ocfs2, add
+ * an ocfs2_extent_tree_operations structure and the matching
+ * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
+ * for the allocation portion of the extent tree.
+ */
+struct ocfs2_extent_tree_operations {
+	/*
+	 * last_eb_blk is the block number of the right most leaf extent
+	 * block.  Most on-disk structures containing an extent tree store
+	 * this value for fast access.  The ->eo_set_last_eb_blk() and
+	 * ->eo_get_last_eb_blk() operations access this value.  They are
+	 *  both required.
+	 */
+	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
+				   u64 blkno);
+	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * The on-disk structure usually keeps track of how many total
+	 * clusters are stored in this extent tree.  This function updates
+	 * that value.  new_clusters is the delta, and must be
+	 * added to the total.  Required.
+	 */
+	void (*eo_update_clusters)(struct inode *inode,
+				   struct ocfs2_extent_tree *et,
+				   u32 new_clusters);
+
+	/*
+	 * If ->eo_insert_check() exists, it is called before rec is
+	 * inserted into the extent tree.  It is optional.
+	 */
+	int (*eo_insert_check)(struct inode *inode,
+			       struct ocfs2_extent_tree *et,
+			       struct ocfs2_extent_rec *rec);
+	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
+
+	/*
+	 * --------------------------------------------------------------
+	 * The remaining are internal to ocfs2_extent_tree and don't have
+	 * accessor functions
+	 */
+
+	/*
+	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
+	 * It is required.
+	 */
+	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
+
+	/*
+	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
+	 * it exists.  If it does not, et->et_max_leaf_clusters is set
+	 * to 0 (unlimited).  Optional.
+	 */
+	void (*eo_fill_max_leaf_clusters)(struct inode *inode,
+					  struct ocfs2_extent_tree *et);
+};
+
+
+/*
+ * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
+ * in the methods.
+ */
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					 u64 blkno);
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+					 struct ocfs2_extent_tree *et,
+					 u32 clusters);
+static int ocfs2_dinode_insert_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et,
+				     struct ocfs2_extent_rec *rec);
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_dinode_update_clusters,
+	.eo_insert_check	= ocfs2_dinode_insert_check,
+	.eo_sanity_check	= ocfs2_dinode_sanity_check,
+	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
+};
+
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					 u64 blkno)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	di->i_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	return le64_to_cpu(di->i_last_eb_blk);
+}
+
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+					 struct ocfs2_extent_tree *et,
+					 u32 clusters)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	le32_add_cpu(&di->i_clusters, clusters);
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+static int ocfs2_dinode_insert_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et,
+				     struct ocfs2_extent_rec *rec)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+			(OCFS2_I(inode)->ip_clusters != rec->e_cpos),
+			"Device %s, asking for sparse allocation: inode %llu, "
+			"cpos %u, clusters %u\n",
+			osb->dev_str,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			rec->e_cpos,
+			OCFS2_I(inode)->ip_clusters);
+
+	return 0;
+}
+
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+				     struct ocfs2_extent_tree *et)
+{
+	int ret = 0;
+	struct ocfs2_dinode *di;
+
+	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+
+	di = et->et_object;
+	if (!OCFS2_IS_VALID_DINODE(di)) {
+		ret = -EIO;
+		ocfs2_error(inode->i_sb,
+			"Inode %llu has invalid path root",
+			(unsigned long long)OCFS2_I(inode)->ip_blkno);
+	}
+
+	return ret;
+}
+
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_dinode *di = et->et_object;
+
+	et->et_root_el = &di->id2.i_list;
+}
+
+
+static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_value_root *xv = et->et_object;
+
+	et->et_root_el = &xv->xr_list;
+}
+
+static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					      u64 blkno)
+{
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *)et->et_object;
+
+	xv->xr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *) et->et_object;
+
+	return le64_to_cpu(xv->xr_last_eb_blk);
+}
+
+static void ocfs2_xattr_value_update_clusters(struct inode *inode,
+					      struct ocfs2_extent_tree *et,
+					      u32 clusters)
+{
+	struct ocfs2_xattr_value_root *xv =
+		(struct ocfs2_xattr_value_root *)et->et_object;
+
+	le32_add_cpu(&xv->xr_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
+	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
+};
+
+static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+
+	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
+}
+
+static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
+						    struct ocfs2_extent_tree *et)
+{
+	et->et_max_leaf_clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb,
+					 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
+}
+
+static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					     u64 blkno)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	xt->xt_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+	return le64_to_cpu(xt->xt_last_eb_blk);
+}
+
+static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
+					     struct ocfs2_extent_tree *et,
+					     u32 clusters)
+{
+	struct ocfs2_xattr_block *xb = et->et_object;
+
+	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
+	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
+	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
+	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
+	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
+};
+
+static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+				     struct inode *inode,
+				     struct buffer_head *bh,
+				     void *obj,
+				     struct ocfs2_extent_tree_operations *ops)
+{
+	et->et_ops = ops;
+	et->et_root_bh = bh;
+	if (!obj)
+		obj = (void *)bh->b_data;
+	et->et_object = obj;
+
+	et->et_ops->eo_fill_root_el(et);
+	if (!et->et_ops->eo_fill_max_leaf_clusters)
+		et->et_max_leaf_clusters = 0;
+	else
+		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
+}
+
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				   struct inode *inode,
+				   struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+}
+
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+				       struct inode *inode,
+				       struct buffer_head *bh)
+{
+	__ocfs2_init_extent_tree(et, inode, bh, NULL,
+				 &ocfs2_xattr_tree_et_ops);
+}
+
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+					struct inode *inode,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_value_root *xv)
+{
+	__ocfs2_init_extent_tree(et, inode, bh, xv,
+				 &ocfs2_xattr_value_et_ops);
+}
+
+static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
+					    u64 new_last_eb_blk)
+{
+	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
+}
+
+static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+	return et->et_ops->eo_get_last_eb_blk(et);
+}
+
+static inline void ocfs2_et_update_clusters(struct inode *inode,
+					    struct ocfs2_extent_tree *et,
+					    u32 clusters)
+{
+	et->et_ops->eo_update_clusters(inode, et, clusters);
+}
+
+static inline int ocfs2_et_insert_check(struct inode *inode,
+					struct ocfs2_extent_tree *et,
+					struct ocfs2_extent_rec *rec)
+{
+	int ret = 0;
+
+	if (et->et_ops->eo_insert_check)
+		ret = et->et_ops->eo_insert_check(inode, et, rec);
+	return ret;
+}
+
+static inline int ocfs2_et_sanity_check(struct inode *inode,
+					struct ocfs2_extent_tree *et)
+{
+	int ret = 0;
+
+	if (et->et_ops->eo_sanity_check)
+		ret = et->et_ops->eo_sanity_check(inode, et);
+	return ret;
+}
+
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
 					 struct ocfs2_extent_block *eb);
@@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 }
 
 /*
- * Allocate and initialize a new path based on a disk inode tree.
- */
-static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
-{
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	struct ocfs2_extent_list *el = &di->id2.i_list;
-
-	return ocfs2_new_path(di_bh, el);
-}
-
-/*
  * Convenience function to journal all components in a path.
  */
 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
@@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt {
  */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct ocfs2_dinode *fe)
+			   struct ocfs2_extent_tree *et)
 {
 	int retval;
-	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_list *el = NULL;
 	struct ocfs2_extent_block *eb;
 	struct buffer_head *eb_bh = NULL;
+	u64 last_eb_blk = 0;
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		retval = -EIO;
-		goto bail;
-	}
+	el = et->et_root_el;
+	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 
-	if (fe->i_last_eb_blk) {
-		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
-					  &eb_bh, OCFS2_BH_CACHED, inode);
+	if (last_eb_blk) {
+		retval = ocfs2_read_block(inode, last_eb_blk,
+					  &eb_bh);
 		if (retval < 0) {
 			mlog_errno(retval);
 			goto bail;
 		}
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
-	} else
-		el = &fe->id2.i_list;
+	}
 
 	BUG_ON(el->l_tree_depth != 0);
 
 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
 bail:
-	if (eb_bh)
-		brelse(eb_bh);
+	brelse(eb_bh);
 
 	mlog_exit(retval);
 	return retval;
@@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
 bail:
 	if (status < 0) {
 		for(i = 0; i < wanted; i++) {
-			if (bhs[i])
-				brelse(bhs[i]);
+			brelse(bhs[i]);
 			bhs[i] = NULL;
 		}
 	}
@@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
 static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    handle_t *handle,
 			    struct inode *inode,
-			    struct buffer_head *fe_bh,
+			    struct ocfs2_extent_tree *et,
 			    struct buffer_head *eb_bh,
 			    struct buffer_head **last_eb_bh,
 			    struct ocfs2_alloc_context *meta_ac)
@@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	u64 next_blkno, new_last_eb_blk;
 	struct buffer_head *bh;
 	struct buffer_head **new_eb_bhs = NULL;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *eb_el;
 	struct ocfs2_extent_list  *el;
@@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	BUG_ON(!last_eb_bh || !*last_eb_bh);
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
 	if (eb_bh) {
 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 		el = &eb->h_list;
 	} else
-		el = &fe->id2.i_list;
+		el = et->et_root_el;
 
 	/* we never add a branch to a leaf. */
 	BUG_ON(!el->l_tree_depth);
@@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		mlog_errno(status);
 		goto bail;
 	}
-	status = ocfs2_journal_access(handle, inode, fe_bh,
+	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	}
 
 	/* Link the new branch into the rest of the tree (el will
-	 * either be on the fe, or the extent block passed in. */
+	 * either be on the root_bh, or the extent block passed in. */
 	i = le16_to_cpu(el->l_next_free_rec);
 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
@@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 
 	/* fe needs a new last extent block pointer, as does the
 	 * next_leaf on the previously last-extent-block. */
-	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
 
 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
 	if (status < 0)
 		mlog_errno(status);
-	status = ocfs2_journal_dirty(handle, fe_bh);
+	status = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (status < 0)
 		mlog_errno(status);
 	if (eb_bh) {
@@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 bail:
 	if (new_eb_bhs) {
 		for (i = 0; i < new_blocks; i++)
-			if (new_eb_bhs[i])
-				brelse(new_eb_bhs[i]);
+			brelse(new_eb_bhs[i]);
 		kfree(new_eb_bhs);
 	}
 
@@ -717,16 +1031,15 @@ bail:
 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 				  handle_t *handle,
 				  struct inode *inode,
-				  struct buffer_head *fe_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_alloc_context *meta_ac,
 				  struct buffer_head **ret_new_eb_bh)
 {
 	int status, i;
 	u32 new_clusters;
 	struct buffer_head *new_eb_bh = NULL;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list  *fe_el;
+	struct ocfs2_extent_list  *root_el;
 	struct ocfs2_extent_list  *eb_el;
 
 	mlog_entry_void();
@@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	}
 
 	eb_el = &eb->h_list;
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	fe_el = &fe->id2.i_list;
+	root_el = et->et_root_el;
 
 	status = ocfs2_journal_access(handle, inode, new_eb_bh,
 				      OCFS2_JOURNAL_ACCESS_CREATE);
@@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	/* copy the fe data into the new extent block */
-	eb_el->l_tree_depth = fe_el->l_tree_depth;
-	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-		eb_el->l_recs[i] = fe_el->l_recs[i];
+	/* copy the root extent list data into the new extent block */
+	eb_el->l_tree_depth = root_el->l_tree_depth;
+	eb_el->l_next_free_rec = root_el->l_next_free_rec;
+	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		eb_el->l_recs[i] = root_el->l_recs[i];
 
 	status = ocfs2_journal_dirty(handle, new_eb_bh);
 	if (status < 0) {
@@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
+	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 
 	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
 
-	/* update fe now */
-	le16_add_cpu(&fe_el->l_tree_depth, 1);
-	fe_el->l_recs[0].e_cpos = 0;
-	fe_el->l_recs[0].e_blkno = eb->h_blkno;
-	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
-	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
-		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
-	fe_el->l_next_free_rec = cpu_to_le16(1);
+	/* update root_bh now */
+	le16_add_cpu(&root_el->l_tree_depth, 1);
+	root_el->l_recs[0].e_cpos = 0;
+	root_el->l_recs[0].e_blkno = eb->h_blkno;
+	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+	root_el->l_next_free_rec = cpu_to_le16(1);
 
 	/* If this is our 1st tree depth shift, then last_eb_blk
 	 * becomes the allocated extent block */
-	if (fe_el->l_tree_depth == cpu_to_le16(1))
-		fe->i_last_eb_blk = eb->h_blkno;
+	if (root_el->l_tree_depth == cpu_to_le16(1))
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
-	status = ocfs2_journal_dirty(handle, fe_bh);
+	status = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	new_eb_bh = NULL;
 	status = 0;
 bail:
-	if (new_eb_bh)
-		brelse(new_eb_bh);
+	brelse(new_eb_bh);
 
 	mlog_exit(status);
 	return status;
@@ -817,22 +1128,21 @@ bail:
  * 1) a lowest extent block is found, then we pass it back in
  *    *lowest_eb_bh and return '0'
  *
- * 2) the search fails to find anything, but the dinode has room. We
+ * 2) the search fails to find anything, but the root_el has room. We
  *    pass NULL back in *lowest_eb_bh, but still return '0'
  *
- * 3) the search fails to find anything AND the dinode is full, in
+ * 3) the search fails to find anything AND the root_el is full, in
  *    which case we return > 0
  *
  * return status < 0 indicates an error.
  */
 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 				    struct inode *inode,
-				    struct buffer_head *fe_bh,
+				    struct ocfs2_extent_tree *et,
 				    struct buffer_head **target_bh)
 {
 	int status = 0, i;
 	u64 blkno;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *el;
 	struct buffer_head *bh = NULL;
@@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	*target_bh = NULL;
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	el = &fe->id2.i_list;
+	el = et->et_root_el;
 
 	while(le16_to_cpu(el->l_tree_depth) > 1) {
 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		if (bh) {
-			brelse(bh);
-			bh = NULL;
-		}
+		brelse(bh);
+		bh = NULL;
 
-		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
-					  inode);
+		status = ocfs2_read_block(inode, blkno, &bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 		if (le16_to_cpu(el->l_next_free_rec) <
 		    le16_to_cpu(el->l_count)) {
-			if (lowest_bh)
-				brelse(lowest_bh);
+			brelse(lowest_bh);
 			lowest_bh = bh;
 			get_bh(lowest_bh);
 		}
@@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 
 	/* If we didn't find one and the fe doesn't have any room,
 	 * then return '1' */
-	if (!lowest_bh
-	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+	el = et->et_root_el;
+	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
 		status = 1;
 
 	*target_bh = lowest_bh;
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -919,19 +1223,19 @@ bail:
  * *last_eb_bh will be updated by ocfs2_add_branch().
  */
 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
-			   struct buffer_head *di_bh, int *final_depth,
+			   struct ocfs2_extent_tree *et, int *final_depth,
 			   struct buffer_head **last_eb_bh,
 			   struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, shift;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+	struct ocfs2_extent_list *el = et->et_root_el;
+	int depth = le16_to_cpu(el->l_tree_depth);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct buffer_head *bh = NULL;
 
 	BUG_ON(meta_ac == NULL);
 
-	shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+	shift = ocfs2_find_branch_target(osb, inode, et, &bh);
 	if (shift < 0) {
 		ret = shift;
 		mlog_errno(ret);
@@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
 		/* ocfs2_shift_tree_depth will return us a buffer with
 		 * the new extent block (so we can pass that to
 		 * ocfs2_add_branch). */
-		ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+		ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
 					     meta_ac, &bh);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
 	/* call ocfs2_add_branch to add the final part of the tree with
 	 * the new data. */
 	mlog(0, "add branch. bh = %p\n", bh);
-	ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+	ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
 			       meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -1236,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode,
 
 		brelse(bh);
 		bh = NULL;
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
-				       &bh, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, blkno, &bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2058,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 				     struct ocfs2_path *right_path,
 				     int subtree_index,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     int *deleted)
+				     int *deleted,
+				     struct ocfs2_extent_tree *et)
 {
 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
-	struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
 	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
 	struct ocfs2_extent_block *eb;
 
@@ -2114,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		 * We have to update i_last_eb_blk during the meta
 		 * data delete.
 		 */
-		ret = ocfs2_journal_access(handle, inode, di_bh,
+		ret = ocfs2_journal_access(handle, inode, et_root_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -2189,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		di->i_last_eb_blk = eb->h_blkno;
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 
 		/*
 		 * Removal of the extent in the left leaf was skipped
@@ -2199,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		if (right_has_empty)
 			ocfs2_remove_empty_extent(left_leaf_el);
 
-		ret = ocfs2_journal_dirty(handle, di_bh);
+		ret = ocfs2_journal_dirty(handle, et_root_bh);
 		if (ret)
 			mlog_errno(ret);
 
@@ -2322,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 				    handle_t *handle, int orig_credits,
 				    struct ocfs2_path *path,
 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
-				    struct ocfs2_path **empty_extent_path)
+				    struct ocfs2_path **empty_extent_path,
+				    struct ocfs2_extent_tree *et)
 {
 	int ret, subtree_root, deleted;
 	u32 right_cpos;
@@ -2395,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 
 		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
 						right_path, subtree_root,
-						dealloc, &deleted);
+						dealloc, &deleted, et);
 		if (ret == -EAGAIN) {
 			/*
 			 * The rotation has to temporarily stop due to
@@ -2438,29 +2742,20 @@ out:
 }
 
 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
-				       struct ocfs2_path *path,
-				       struct ocfs2_cached_dealloc_ctxt *dealloc)
+				struct ocfs2_path *path,
+				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_extent_tree *et)
 {
 	int ret, subtree_index;
 	u32 cpos;
 	struct ocfs2_path *left_path = NULL;
-	struct ocfs2_dinode *di;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	/*
-	 * XXX: This code assumes that the root is an inode, which is
-	 * true for now but may change as tree code gets generic.
-	 */
-	di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		ret = -EIO;
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has invalid path root",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		goto out;
-	}
 
+	ret = ocfs2_et_sanity_check(inode, et);
+	if (ret)
+		goto out;
 	/*
 	 * There's two ways we handle this depending on
 	 * whether path is the only existing one.
@@ -2517,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		ocfs2_update_edge_lengths(inode, handle, left_path);
 
 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
-		di->i_last_eb_blk = eb->h_blkno;
+		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
 	} else {
 		/*
 		 * 'path' is also the leftmost path which
@@ -2528,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		 */
 		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
 
-		el = &di->id2.i_list;
+		el = et->et_root_el;
 		el->l_tree_depth = 0;
 		el->l_next_free_rec = 0;
 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
 
-		di->i_last_eb_blk = 0;
+		ocfs2_et_set_last_eb_blk(et, 0);
 	}
 
 	ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -2561,7 +2856,8 @@ out:
  */
 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
 				  struct ocfs2_path *path,
-				  struct ocfs2_cached_dealloc_ctxt *dealloc)
+				  struct ocfs2_cached_dealloc_ctxt *dealloc,
+				  struct ocfs2_extent_tree *et)
 {
 	int ret, orig_credits = handle->h_buffer_credits;
 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -2575,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
 	if (path->p_tree_depth == 0) {
 rightmost_no_delete:
 		/*
-		 * In-inode extents. This is trivially handled, so do
+		 * Inline extents. This is trivially handled, so do
 		 * it up front.
 		 */
 		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
@@ -2629,7 +2925,7 @@ rightmost_no_delete:
 		 */
 
 		ret = ocfs2_remove_rightmost_path(inode, handle, path,
-						  dealloc);
+						  dealloc, et);
 		if (ret)
 			mlog_errno(ret);
 		goto out;
@@ -2641,7 +2937,7 @@ rightmost_no_delete:
 	 */
 try_rotate:
 	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
-				       dealloc, &restart_path);
+				       dealloc, &restart_path, et);
 	if (ret && ret != -EAGAIN) {
 		mlog_errno(ret);
 		goto out;
@@ -2653,7 +2949,7 @@ try_rotate:
 
 		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
 					       tmp_path, dealloc,
-					       &restart_path);
+					       &restart_path, et);
 		if (ret && ret != -EAGAIN) {
 			mlog_errno(ret);
 			goto out;
@@ -2939,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 				handle_t *handle,
 				struct ocfs2_extent_rec *split_rec,
 				struct ocfs2_cached_dealloc_ctxt *dealloc,
+				struct ocfs2_extent_tree *et,
 				int index)
 {
 	int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3059,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 		    le16_to_cpu(el->l_next_free_rec) == 1) {
 
 			ret = ocfs2_remove_rightmost_path(inode, handle,
-							  right_path, dealloc);
+							  right_path,
+							  dealloc, et);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3086,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 				     int split_index,
 				     struct ocfs2_extent_rec *split_rec,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     struct ocfs2_merge_ctxt *ctxt)
+				     struct ocfs2_merge_ctxt *ctxt,
+				     struct ocfs2_extent_tree *et)
 
 {
 	int ret = 0;
@@ -3104,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * illegal.
 		 */
 		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc);
+					     dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3147,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
 		/* The merge left us with an empty extent, remove it. */
-		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+		ret = ocfs2_rotate_tree_left(inode, handle, path,
+					     dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3161,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 */
 		ret = ocfs2_merge_rec_left(inode, path,
 					   handle, rec,
-					   dealloc,
+					   dealloc, et,
 					   split_index);
 
 		if (ret) {
@@ -3170,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		}
 
 		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc);
+					     dealloc, et);
 		/*
 		 * Error from this last rotate is not critical, so
 		 * print but don't bubble it up.
@@ -3190,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			ret = ocfs2_merge_rec_left(inode,
 						   path,
 						   handle, split_rec,
-						   dealloc,
+						   dealloc, et,
 						   split_index);
 			if (ret) {
 				mlog_errno(ret);
@@ -3213,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 			 * our leaf. Try to rotate it away.
 			 */
 			ret = ocfs2_rotate_tree_left(inode, handle, path,
-						     dealloc);
+						     dealloc, et);
 			if (ret)
 				mlog_errno(ret);
 			ret = 0;
@@ -3347,16 +3647,6 @@ rotate:
 	ocfs2_rotate_leaf(el, insert_rec);
 }
 
-static inline void ocfs2_update_dinode_clusters(struct inode *inode,
-						struct ocfs2_dinode *di,
-						u32 clusters)
-{
-	le32_add_cpu(&di->i_clusters, clusters);
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
 static void ocfs2_adjust_rightmost_records(struct inode *inode,
 					   handle_t *handle,
 					   struct ocfs2_path *path,
@@ -3558,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode,
 }
 
 /*
- * This function only does inserts on an allocation b-tree. For dinode
- * lists, ocfs2_insert_at_leaf() is called directly.
+ * This function only does inserts on an allocation b-tree. For tree
+ * depth = 0, ocfs2_insert_at_leaf() is called directly.
  *
  * right_path is the path we want to do the actual insert
  * in. left_path should only be passed in if we need to update that
@@ -3656,7 +3946,7 @@ out:
 
 static int ocfs2_do_insert_extent(struct inode *inode,
 				  handle_t *handle,
-				  struct buffer_head *di_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_extent_rec *insert_rec,
 				  struct ocfs2_insert_type *type)
 {
@@ -3664,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 	u32 cpos;
 	struct ocfs2_path *right_path = NULL;
 	struct ocfs2_path *left_path = NULL;
-	struct ocfs2_dinode *di;
 	struct ocfs2_extent_list *el;
 
-	di = (struct ocfs2_dinode *) di_bh->b_data;
-	el = &di->id2.i_list;
+	el = et->et_root_el;
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
+	ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -3682,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		goto out_update_clusters;
 	}
 
-	right_path = ocfs2_new_inode_path(di_bh);
+	right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -3732,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		 * ocfs2_rotate_tree_right() might have extended the
 		 * transaction without re-journaling our tree root.
 		 */
-		ret = ocfs2_journal_access(handle, inode, di_bh,
+		ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -3757,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 out_update_clusters:
 	if (type->ins_split == SPLIT_NONE)
-		ocfs2_update_dinode_clusters(inode, di,
-					     le16_to_cpu(insert_rec->e_leaf_clusters));
+		ocfs2_et_update_clusters(inode, et,
+					 le16_to_cpu(insert_rec->e_leaf_clusters));
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
+	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
 	if (ret)
 		mlog_errno(ret);
 
@@ -3890,7 +4178,8 @@ out:
 static void ocfs2_figure_contig_type(struct inode *inode,
 				     struct ocfs2_insert_type *insert,
 				     struct ocfs2_extent_list *el,
-				     struct ocfs2_extent_rec *insert_rec)
+				     struct ocfs2_extent_rec *insert_rec,
+				     struct ocfs2_extent_tree *et)
 {
 	int i;
 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -3906,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode,
 		}
 	}
 	insert->ins_contig = contig_type;
+
+	if (insert->ins_contig != CONTIG_NONE) {
+		struct ocfs2_extent_rec *rec =
+				&el->l_recs[insert->ins_contig_index];
+		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
+				   le16_to_cpu(insert_rec->e_leaf_clusters);
+
+		/*
+		 * Caller might want us to limit the size of extents, don't
+		 * calculate contiguousness if we might exceed that limit.
+		 */
+		if (et->et_max_leaf_clusters &&
+		    (len > et->et_max_leaf_clusters))
+			insert->ins_contig = CONTIG_NONE;
+	}
 }
 
 /*
@@ -3914,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
  * ocfs2_figure_appending_type() will figure out whether we'll have to
  * insert at the tail of the rightmost leaf.
  *
- * This should also work against the dinode list for tree's with 0
- * depth. If we consider the dinode list to be the rightmost leaf node
+ * This should also work against the root extent list for tree's with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
  * then the logic here makes sense.
  */
 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
@@ -3966,14 +4270,13 @@ set_tail_append:
  * structure.
  */
 static int ocfs2_figure_insert_type(struct inode *inode,
-				    struct buffer_head *di_bh,
+				    struct ocfs2_extent_tree *et,
 				    struct buffer_head **last_eb_bh,
 				    struct ocfs2_extent_rec *insert_rec,
 				    int *free_records,
 				    struct ocfs2_insert_type *insert)
 {
 	int ret;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_path *path = NULL;
@@ -3981,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 
 	insert->ins_split = SPLIT_NONE;
 
-	el = &di->id2.i_list;
+	el = et->et_root_el;
 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
 
 	if (el->l_tree_depth) {
@@ -3991,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
 		 * may want it later.
 		 */
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk), &bh,
-				       OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4014,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		le16_to_cpu(el->l_next_free_rec);
 
 	if (!insert->ins_tree_depth) {
-		ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+		ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
 		ocfs2_figure_appending_type(insert, el, insert_rec);
 		return 0;
 	}
 
-	path = ocfs2_new_inode_path(di_bh);
+	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4048,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
          *     into two types of appends: simple record append, or a
          *     rotate inside the tail leaf.
 	 */
-	ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+	ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
 
 	/*
 	 * The insert code isn't quite ready to deal with all cases of
@@ -4069,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 	 * the case that we're doing a tail append, so maybe we can
 	 * take advantage of that information somehow.
 	 */
-	if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
+	if (ocfs2_et_get_last_eb_blk(et) ==
+	    path_leaf_bh(path)->b_blocknr) {
 		/*
 		 * Ok, ocfs2_find_path() returned us the rightmost
 		 * tree path. This might be an appending insert. There are
@@ -4099,7 +4401,7 @@ out:
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
-			struct buffer_head *fe_bh,
+			struct ocfs2_extent_tree *et,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
@@ -4112,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
 
-	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
-
 	mlog(0, "add %u clusters at position %u to inode %llu\n",
 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
-			(OCFS2_I(inode)->ip_clusters != cpos),
-			"Device %s, asking for sparse allocation: inode %llu, "
-			"cpos %u, clusters %u\n",
-			osb->dev_str,
-			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
-			OCFS2_I(inode)->ip_clusters);
-
 	memset(&rec, 0, sizeof(rec));
 	rec.e_cpos = cpu_to_le32(cpos);
 	rec.e_blkno = cpu_to_le64(start_blk);
 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
 	rec.e_flags = flags;
+	status = ocfs2_et_insert_check(inode, et, &rec);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
 
-	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
 					  &free_records, &insert);
 	if (status < 0) {
 		mlog_errno(status);
@@ -4145,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	     free_records, insert.ins_tree_depth);
 
 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
-		status = ocfs2_grow_tree(inode, handle, fe_bh,
+		status = ocfs2_grow_tree(inode, handle, et,
 					 &insert.ins_tree_depth, &last_eb_bh,
 					 meta_ac);
 		if (status) {
@@ -4155,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	}
 
 	/* Finally, we can add clusters. This might rotate the tree for us. */
-	status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
+	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
 	if (status < 0)
 		mlog_errno(status);
-	else
+	else if (et->et_ops == &ocfs2_dinode_et_ops)
 		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
-	if (last_eb_bh)
-		brelse(last_eb_bh);
+	brelse(last_eb_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Allcate and add clusters into the extent b-tree.
+ * The new clusters(clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is specified by et, and
+ * it is not limited to the file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ */
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+				struct inode *inode,
+				u32 *logical_offset,
+				u32 clusters_to_add,
+				int mark_unwritten,
+				struct ocfs2_extent_tree *et,
+				handle_t *handle,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret)
+{
+	int status = 0;
+	int free_extents;
+	enum ocfs2_alloc_restarted reason = RESTART_NONE;
+	u32 bit_off, num_bits;
+	u64 block;
+	u8 flags = 0;
+
+	BUG_ON(!clusters_to_add);
+
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
+	free_extents = ocfs2_num_free_extents(osb, inode, et);
+	if (free_extents < 0) {
+		status = free_extents;
+		mlog_errno(status);
+		goto leave;
+	}
+
+	/* there are two cases which could cause us to EAGAIN in the
+	 * we-need-more-metadata case:
+	 * 1) we haven't reserved *any*
+	 * 2) we are so fragmented, we've needed to add metadata too
+	 *    many times. */
+	if (!free_extents && !meta_ac) {
+		mlog(0, "we haven't reserved any metadata!\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	} else if ((!free_extents)
+		   && (ocfs2_alloc_context_bits_left(meta_ac)
+		       < ocfs2_extend_meta_needed(et->et_root_el))) {
+		mlog(0, "filesystem is really fragmented...\n");
+		status = -EAGAIN;
+		reason = RESTART_META;
+		goto leave;
+	}
+
+	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+					clusters_to_add, &bit_off, &num_bits);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
 
+	/* reserve our write early -- insert_extent may update the inode */
+	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
+	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	status = ocfs2_insert_extent(osb, handle, inode, et,
+				     *logical_offset, block,
+				     num_bits, flags, meta_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, et->et_root_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= num_bits;
+	*logical_offset += num_bits;
+
+	if (clusters_to_add) {
+		mlog(0, "need to alloc once more, wanted = %u\n",
+		     clusters_to_add);
+		status = -EAGAIN;
+		reason = RESTART_TRANS;
+	}
+
+leave:
 	mlog_exit(status);
+	if (reason_ret)
+		*reason_ret = reason;
 	return status;
 }
 
@@ -4192,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
 static int ocfs2_split_and_insert(struct inode *inode,
 				  handle_t *handle,
 				  struct ocfs2_path *path,
-				  struct buffer_head *di_bh,
+				  struct ocfs2_extent_tree *et,
 				  struct buffer_head **last_eb_bh,
 				  int split_index,
 				  struct ocfs2_extent_rec *orig_split_rec,
@@ -4206,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode,
 	struct ocfs2_extent_rec split_rec = *orig_split_rec;
 	struct ocfs2_insert_type insert;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_dinode *di;
 
 leftright:
 	/*
@@ -4215,8 +4618,7 @@ leftright:
 	 */
 	rec = path_leaf_el(path)->l_recs[split_index];
 
-	di = (struct ocfs2_dinode *)di_bh->b_data;
-	rightmost_el = &di->id2.i_list;
+	rightmost_el = et->et_root_el;
 
 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
 	if (depth) {
@@ -4227,8 +4629,8 @@ leftright:
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
-				      meta_ac);
+		ret = ocfs2_grow_tree(inode, handle, et,
+				      &depth, last_eb_bh, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4265,8 +4667,7 @@ leftright:
 		do_leftright = 1;
 	}
 
-	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
-				     &insert);
+	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4308,8 +4709,9 @@ out:
  * of the tree is required. All other cases will degrade into a less
  * optimal tree layout.
  *
- * last_eb_bh should be the rightmost leaf block for any inode with a
- * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
+ * last_eb_bh should be the rightmost leaf block for any extent
+ * btree. Since a split may grow the tree or a merge might shrink it,
+ * the caller cannot trust the contents of that buffer after this call.
  *
  * This code is optimized for readability - several passes might be
  * made over certain portions of the tree. All of those blocks will
@@ -4317,7 +4719,7 @@ out:
  * extra overhead is not expressed in terms of disk reads.
  */
 static int __ocfs2_mark_extent_written(struct inode *inode,
-				       struct buffer_head *di_bh,
+				       struct ocfs2_extent_tree *et,
 				       handle_t *handle,
 				       struct ocfs2_path *path,
 				       int split_index,
@@ -4357,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	 */
 	if (path->p_tree_depth) {
 		struct ocfs2_extent_block *eb;
-		struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk),
-				       &last_eb_bh, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+				       &last_eb_bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4394,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 		if (ctxt.c_split_covers_rec)
 			el->l_recs[split_index] = *split_rec;
 		else
-			ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
+			ret = ocfs2_split_and_insert(inode, handle, path, et,
 						     &last_eb_bh, split_index,
 						     split_rec, meta_ac);
 		if (ret)
@@ -4402,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	} else {
 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
 						split_index, split_rec,
-						dealloc, &ctxt);
+						dealloc, &ctxt, et);
 		if (ret)
 			mlog_errno(ret);
 	}
@@ -4420,7 +4820,8 @@ out:
  *
  * The caller is responsible for passing down meta_ac if we'll need it.
  */
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4446,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
 	/*
 	 * XXX: This should be fixed up so that we just re-insert the
 	 * next extent records.
+	 *
+	 * XXX: This is a hack on the extent tree, maybe it should be
+	 * an op?
 	 */
-	ocfs2_extent_map_trunc(inode, 0);
+	if (et->et_ops == &ocfs2_dinode_et_ops)
+		ocfs2_extent_map_trunc(inode, 0);
 
-	left_path = ocfs2_new_inode_path(di_bh);
+	left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4480,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
 
-	ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
-					  index, &split_rec, meta_ac, dealloc);
+	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
+					  index, &split_rec, meta_ac,
+					  dealloc);
 	if (ret)
 		mlog_errno(ret);
 
@@ -4490,13 +4896,12 @@ out:
 	return ret;
 }
 
-static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
+static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 			    handle_t *handle, struct ocfs2_path *path,
 			    int index, u32 new_range,
 			    struct ocfs2_alloc_context *meta_ac)
 {
 	int ret, depth, credits = handle->h_buffer_credits;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *rightmost_el, *el;
@@ -4513,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 
 	depth = path->p_tree_depth;
 	if (depth > 0) {
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       le64_to_cpu(di->i_last_eb_blk),
-				       &last_eb_bh, OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+				       &last_eb_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -4526,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 	} else
 		rightmost_el = path_leaf_el(path);
 
-	credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
+	credits += path->p_tree_depth +
+		   ocfs2_extend_meta_needed(et->et_root_el);
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4535,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
+		ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
 				      meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -4549,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 	insert.ins_split = SPLIT_RIGHT;
 	insert.ins_tree_depth = depth;
 
-	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
+	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
 	if (ret)
 		mlog_errno(ret);
 
@@ -4561,7 +4966,8 @@ out:
 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 			      struct ocfs2_path *path, int index,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
-			      u32 cpos, u32 len)
+			      u32 cpos, u32 len,
+			      struct ocfs2_extent_tree *et)
 {
 	int ret;
 	u32 left_cpos, rec_range, trunc_range;
@@ -4573,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 	struct ocfs2_extent_block *eb;
 
 	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
-		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4704,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 
 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
 
-	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4715,7 +5121,8 @@ out:
 	return ret;
 }
 
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode,
+			struct ocfs2_extent_tree *et,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4724,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 	u32 rec_range, trunc_range;
 	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_list *el;
-	struct ocfs2_path *path;
+	struct ocfs2_path *path = NULL;
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-	path = ocfs2_new_inode_path(di_bh);
+	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4781,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 
 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	} else {
-		ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
+		ret = ocfs2_split_tree(inode, et, handle, path, index,
 				       trunc_range, meta_ac);
 		if (ret) {
 			mlog_errno(ret);
@@ -4836,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
 		}
 
 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
-					 cpos, len);
+					 cpos, len, et);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -5179,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
-				  OCFS2_BH_CACHED, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
 	if (status < 0) {
 		iput(inode);
 		mlog_errno(status);
@@ -5255,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 bail:
 	if (tl_inode)
 		iput(tl_inode);
-	if (tl_bh)
-		brelse(tl_bh);
+	brelse(tl_bh);
 
 	if (status < 0 && (*tl_copy)) {
 		kfree(*tl_copy);
@@ -5999,20 +6404,13 @@ bail:
 	return status;
 }
 
-static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 {
 	set_buffer_uptodate(bh);
 	mark_buffer_dirty(bh);
 	return 0;
 }
 
-static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
-{
-	set_buffer_uptodate(bh);
-	mark_buffer_dirty(bh);
-	return ocfs2_journal_dirty_data(handle, bh);
-}
-
 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
 				     unsigned int from, unsigned int to,
 				     struct page *page, int zero, u64 *phys)
@@ -6031,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
 	 * here if they aren't - ocfs2_map_page_blocks()
 	 * might've skipped some
 	 */
-	if (ocfs2_should_order_data(inode)) {
-		ret = walk_page_buffers(handle,
-					page_buffers(page),
-					from, to, &partial,
-					ocfs2_ordered_zero_func);
-		if (ret < 0)
-			mlog_errno(ret);
-	} else {
+	ret = walk_page_buffers(handle, page_buffers(page),
+				from, to, &partial,
+				ocfs2_zero_func);
+	if (ret < 0)
+		mlog_errno(ret);
+	else if (ocfs2_should_order_data(inode)) {
+		ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 		ret = walk_page_buffers(handle, page_buffers(page),
 					from, to, &partial,
-					ocfs2_writeback_zero_func);
+					ocfs2_journal_dirty_data);
+#endif
 		if (ret < 0)
 			mlog_errno(ret);
 	}
@@ -6206,20 +6605,29 @@ out:
 	return ret;
 }
 
-static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
+static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
+					     struct ocfs2_dinode *di)
 {
 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
 
-	memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		memset(&di->id2, 0, blocksize -
+				    offsetof(struct ocfs2_dinode, id2) -
+				    xattrsize);
+	else
+		memset(&di->id2, 0, blocksize -
+				    offsetof(struct ocfs2_dinode, id2));
 }
 
 void ocfs2_dinode_new_extent_list(struct inode *inode,
 				  struct ocfs2_dinode *di)
 {
-	ocfs2_zero_dinode_id2(inode, di);
+	ocfs2_zero_dinode_id2_with_xattr(inode, di);
 	di->id2.i_list.l_tree_depth = 0;
 	di->id2.i_list.l_next_free_rec = 0;
-	di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
+	di->id2.i_list.l_count = cpu_to_le16(
+		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
 }
 
 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
@@ -6236,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
 	 * We clear the entire i_data structure here so that all
 	 * fields can be properly initialized.
 	 */
-	ocfs2_zero_dinode_id2(inode, di);
+	ocfs2_zero_dinode_id2_with_xattr(inode, di);
 
-	idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
+	idata->id_count = cpu_to_le16(
+			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
 }
 
 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
@@ -6253,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct page **pages = NULL;
 	loff_t end = osb->s_clustersize;
+	struct ocfs2_extent_tree et;
 
 	has_data = i_size_read(inode) ? 1 : 0;
 
@@ -6352,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		 * this proves to be false, we could always re-build
 		 * the in-inode data from our pages.
 		 */
-		ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
+		ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
+		ret = ocfs2_insert_extent(osb, handle, inode, &et,
 					  0, block, 1, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
@@ -6395,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 	handle_t *handle = NULL;
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_path *path = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
 
 	mlog_entry_void();
 
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						     i_size_read(inode));
 
-	path = ocfs2_new_inode_path(fe_bh);
+	path = ocfs2_new_path(fe_bh, &di->id2.i_list);
 	if (!path) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -6572,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
 
 	if (fe->id2.i_list.l_tree_depth) {
-		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
-					  &last_eb_bh, OCFS2_BH_CACHED, inode);
+		status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
+					  &last_eb_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -6686,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
 		mlog(ML_NOTICE,
 		     "Truncate completion has non-empty dealloc context\n");
 
-	if (tc->tc_last_eb_bh)
-		brelse(tc->tc_last_eb_bh);
+	brelse(tc->tc_last_eb_bh);
 
 	kfree(tc);
 }
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 60cd3d59230..70257c84cfb 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,30 +26,102 @@
 #ifndef OCFS2_ALLOC_H
 #define OCFS2_ALLOC_H
 
+
+/*
+ * For xattr tree leaf, we limit the leaf byte size to be 64K.
+ */
+#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
+
+/*
+ * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
+ * the b-tree operations in ocfs2. Now all the b-tree operations are not
+ * limited to ocfs2_dinode only. Any data which need to allocate clusters
+ * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
+ * and operation.
+ *
+ * ocfs2_extent_tree becomes the first-class object for extent tree
+ * manipulation.  Callers of the alloc.c code need to fill it via one of
+ * the ocfs2_init_*_extent_tree() operations below.
+ *
+ * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
+ * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
+ * functions.
+ * ocfs2_extent_tree_operations abstract the normal operations we do for
+ * the root of extent b-tree.
+ */
+struct ocfs2_extent_tree_operations;
+struct ocfs2_extent_tree {
+	struct ocfs2_extent_tree_operations	*et_ops;
+	struct buffer_head			*et_root_bh;
+	struct ocfs2_extent_list		*et_root_el;
+	void					*et_object;
+	unsigned int				et_max_leaf_clusters;
+};
+
+/*
+ * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
+ * specified object buffer.
+ */
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+				   struct inode *inode,
+				   struct buffer_head *bh);
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+				       struct inode *inode,
+				       struct buffer_head *bh);
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+					struct inode *inode,
+					struct buffer_head *bh,
+					struct ocfs2_xattr_value_root *xv);
+
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
-			struct buffer_head *fe_bh,
+			struct ocfs2_extent_tree *et,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
 			u8 flags,
 			struct ocfs2_alloc_context *meta_ac);
+
+enum ocfs2_alloc_restarted {
+	RESTART_NONE = 0,
+	RESTART_TRANS,
+	RESTART_META
+};
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+				struct inode *inode,
+				u32 *logical_offset,
+				u32 clusters_to_add,
+				int mark_unwritten,
+				struct ocfs2_extent_tree *et,
+				handle_t *handle,
+				struct ocfs2_alloc_context *data_ac,
+				struct ocfs2_alloc_context *meta_ac,
+				enum ocfs2_alloc_restarted *reason_ret);
 struct ocfs2_cached_dealloc_ctxt;
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+			      struct ocfs2_extent_tree *et,
 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
 			      struct ocfs2_alloc_context *meta_ac,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc);
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode,
+			struct ocfs2_extent_tree *et,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
-			   struct ocfs2_dinode *fe);
-/* how many new metadata chunks would an allocation need at maximum? */
-static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+			   struct ocfs2_extent_tree *et);
+
+/*
+ * how many new metadata chunks would an allocation need at maximum?
+ *
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
 {
 	/*
 	 * Rather than do all the work of determining how much we need
@@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
 	 * new tree_depth==0 extent_block, and one block at the new
 	 * top-of-the tree.
 	 */
-	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+	return le16_to_cpu(root_el->l_tree_depth) + 2;
 }
 
 void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a53da146627..c22543b3342 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				  OCFS2_I(inode)->ip_blkno,
-				  &bh, OCFS2_BH_CACHED, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 	err = 0;
 
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(err);
 	return err;
@@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
 {
 	int ret;
 	struct buffer_head *di_bh = NULL;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
-	ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
-			       OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 	}
 
 	if (ocfs2_should_order_data(inode)) {
+		ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 		ret = walk_page_buffers(handle,
 					page_buffers(page),
 					from, to, NULL,
 					ocfs2_journal_dirty_data);
-		if (ret < 0) 
+#endif
+		if (ret < 0)
 			mlog_errno(ret);
 	}
 out:
@@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
 {
 	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
 
-	journal_invalidatepage(journal, page, offset);
+	jbd2_journal_invalidatepage(journal, page, offset);
 }
 
 static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
 
 	if (!page_has_buffers(page))
 		return 0;
-	return journal_try_to_free_buffers(journal, page, wait);
+	return jbd2_journal_try_to_free_buffers(journal, page, wait);
 }
 
 static ssize_t ocfs2_direct_IO(int rw,
@@ -1074,11 +1072,15 @@ static void ocfs2_write_failure(struct inode *inode,
 		tmppage = wc->w_pages[i];
 
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode))
+			if (ocfs2_should_order_data(inode)) {
+				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 				walk_page_buffers(wc->w_handle,
 						  page_buffers(tmppage),
 						  from, to, NULL,
 						  ocfs2_journal_dirty_data);
+#endif
+			}
 
 			block_commit_write(tmppage, from, to);
 		}
@@ -1242,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 	int ret, i, new, should_zero = 0;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
+	struct ocfs2_extent_tree et;
 
 	new = phys == 0 ? 1 : 0;
 	if (new || unwritten)
@@ -1255,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		 * any additional semaphores or cluster locks.
 		 */
 		tmp_pos = cpos;
-		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
-						 &tmp_pos, 1, 0, wc->w_di_bh,
-						 wc->w_handle, data_ac,
-						 meta_ac, NULL);
+		ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
+					   &tmp_pos, 1, 0, wc->w_di_bh,
+					   wc->w_handle, data_ac,
+					   meta_ac, NULL);
 		/*
 		 * This shouldn't happen because we must have already
 		 * calculated the correct meta data allocation required. The
@@ -1276,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 			goto out;
 		}
 	} else if (unwritten) {
-		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
+		ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+		ret = ocfs2_mark_extent_written(inode, &et,
 						wc->w_handle, cpos, 1, phys,
 						meta_ac, &wc->w_dealloc);
 		if (ret < 0) {
@@ -1665,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle;
+	struct ocfs2_extent_tree et;
 
 	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
 	if (ret) {
@@ -1712,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
-		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
-					    extents_to_split, &data_ac, &meta_ac);
+		mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
+		     " clusters_to_add = %u, extents_to_split = %u\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
+		     clusters_to_alloc, extents_to_split);
+
+		ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+		ret = ocfs2_lock_allocators(inode, &et,
+					    clusters_to_alloc, extents_to_split,
+					    &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+		credits = ocfs2_calc_extend_credits(inode->i_sb,
+						    &di->id2.i_list,
 						    clusters_to_alloc);
 
 	}
@@ -1905,11 +1919,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		}
 
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode))
+			if (ocfs2_should_order_data(inode)) {
+				ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 				walk_page_buffers(wc->w_handle,
 						  page_buffers(tmppage),
 						  from, to, NULL,
 						  ocfs2_journal_dirty_data);
+#endif
+			}
 			block_commit_write(tmppage, from, to);
 		}
 	}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f136639f5b4..7e947c67246 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 	/* remove from dirty list before I/O. */
 	clear_buffer_dirty(bh);
 
-	get_bh(bh); /* for end_buffer_write_sync() */                   
+	get_bh(bh); /* for end_buffer_write_sync() */
 	bh->b_end_io = end_buffer_write_sync;
 	submit_bh(WRITE, bh);
 
@@ -88,22 +88,103 @@ out:
 	return ret;
 }
 
-int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
-		      struct buffer_head *bhs[], int flags,
-		      struct inode *inode)
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+			   unsigned int nr, struct buffer_head *bhs[])
+{
+	int status = 0;
+	unsigned int i;
+	struct buffer_head *bh;
+
+	if (!nr) {
+		mlog(ML_BH_IO, "No buffers will be read!\n");
+		goto bail;
+	}
+
+	for (i = 0 ; i < nr ; i++) {
+		if (bhs[i] == NULL) {
+			bhs[i] = sb_getblk(osb->sb, block++);
+			if (bhs[i] == NULL) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+		}
+		bh = bhs[i];
+
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "trying to sync read a jbd "
+			     "managed bh (blocknr = %llu), skipping\n",
+			     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		if (buffer_dirty(bh)) {
+			/* This should probably be a BUG, or
+			 * at least return an error. */
+			mlog(ML_ERROR,
+			     "trying to sync read a dirty "
+			     "buffer! (blocknr = %llu), skipping\n",
+			     (unsigned long long)bh->b_blocknr);
+			continue;
+		}
+
+		lock_buffer(bh);
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "block %llu had the JBD bit set "
+			     "while I was in lock_buffer!",
+			     (unsigned long long)bh->b_blocknr);
+			BUG();
+		}
+
+		clear_buffer_uptodate(bh);
+		get_bh(bh); /* for end_buffer_read_sync() */
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, bh);
+	}
+
+	for (i = nr; i > 0; i--) {
+		bh = bhs[i - 1];
+
+		if (buffer_jbd(bh)) {
+			mlog(ML_ERROR,
+			     "the journal got the buffer while it was "
+			     "locked for io! (blocknr = %llu)\n",
+			     (unsigned long long)bh->b_blocknr);
+			BUG();
+		}
+
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* Status won't be cleared from here on out,
+			 * so we can safely record this and loop back
+			 * to cleanup the other buffers. */
+			status = -EIO;
+			put_bh(bh);
+			bhs[i - 1] = NULL;
+		}
+	}
+
+bail:
+	return status;
+}
+
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags)
 {
 	int status = 0;
-	struct super_block *sb;
 	int i, ignore_cache = 0;
 	struct buffer_head *bh;
 
-	mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
-		   (unsigned long long)block, nr, flags, inode);
+	mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
+		   inode, (unsigned long long)block, nr, flags);
 
+	BUG_ON(!inode);
 	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
-	       (!inode || !(flags & OCFS2_BH_CACHED)));
+	       (flags & OCFS2_BH_IGNORE_CACHE));
 
-	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+	if (bhs == NULL) {
 		status = -EINVAL;
 		mlog_errno(status);
 		goto bail;
@@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		goto bail;
 	}
 
-	sb = osb->sb;
-
-	if (flags & OCFS2_BH_CACHED && !inode)
-		flags &= ~OCFS2_BH_CACHED;
-
-	if (inode)
-		mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
 	for (i = 0 ; i < nr ; i++) {
 		if (bhs[i] == NULL) {
-			bhs[i] = sb_getblk(sb, block++);
+			bhs[i] = sb_getblk(inode->i_sb, block++);
 			if (bhs[i] == NULL) {
-				if (inode)
-					mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+				mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 				status = -EIO;
 				mlog_errno(status);
 				goto bail;
 			}
 		}
 		bh = bhs[i];
-		ignore_cache = 0;
+		ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
 
 		/* There are three read-ahead cases here which we need to
 		 * be concerned with. All three assume a buffer has
@@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		 *    before our is-it-in-flight check.
 		 */
 
-		if (flags & OCFS2_BH_CACHED &&
-		    !ocfs2_buffer_uptodate(inode, bh)) {
+		if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
 			mlog(ML_UPTODATE,
 			     "bh (%llu), inode %llu not uptodate\n",
 			     (unsigned long long)bh->b_blocknr,
 			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+			/* We're using ignore_cache here to say
+			 * "go to disk" */
 			ignore_cache = 1;
 		}
 
 		/* XXX: Can we ever get this and *not* have the cached
 		 * flag set? */
 		if (buffer_jbd(bh)) {
-			if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+			if (ignore_cache)
 				mlog(ML_BH_IO, "trying to sync read a jbd "
 					       "managed bh (blocknr = %llu)\n",
 				     (unsigned long long)bh->b_blocknr);
 			continue;
 		}
 
-		if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+		if (ignore_cache) {
 			if (buffer_dirty(bh)) {
 				/* This should probably be a BUG, or
 				 * at least return an error. */
@@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 			 * previously read-ahead buffer may have
 			 * completed I/O while we were waiting for the
 			 * buffer lock. */
-			if ((flags & OCFS2_BH_CACHED)
+			if (!(flags & OCFS2_BH_IGNORE_CACHE)
 			    && !(flags & OCFS2_BH_READAHEAD)
 			    && ocfs2_buffer_uptodate(inode, bh)) {
 				unlock_buffer(bh);
@@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		/* Always set the buffer in the cache, even if it was
 		 * a forced read, or read-ahead which hasn't yet
 		 * completed. */
-		if (inode)
-			ocfs2_set_buffer_uptodate(inode, bh);
+		ocfs2_set_buffer_uptodate(inode, bh);
 	}
-	if (inode)
-		mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 
 	     (unsigned long long)block, nr,
-	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
+	     ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
+	     flags);
 
 bail:
 
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c2e78614c3e..75e1dcb1ade 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,31 +31,29 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
 			     int uptodate);
 
-static inline int ocfs2_read_block(struct ocfs2_super          *osb,
+static inline int ocfs2_read_block(struct inode	       *inode,
 				   u64                  off,
-				   struct buffer_head **bh,
-				   int                  flags,
-				   struct inode        *inode);
+				   struct buffer_head **bh);
 
 int ocfs2_write_block(struct ocfs2_super          *osb,
 		      struct buffer_head  *bh,
 		      struct inode        *inode);
-int ocfs2_read_blocks(struct ocfs2_super          *osb,
+int ocfs2_read_blocks(struct inode	  *inode,
 		      u64                  block,
 		      int                  nr,
 		      struct buffer_head  *bhs[],
-		      int                  flags,
-		      struct inode        *inode);
+		      int                  flags);
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+			   unsigned int nr, struct buffer_head *bhs[]);
 
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh);
 
-#define OCFS2_BH_CACHED            1
+#define OCFS2_BH_IGNORE_CACHE      1
 #define OCFS2_BH_READAHEAD         8
 
-static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
-				   struct buffer_head **bh, int flags,
-				   struct inode *inode)
+static inline int ocfs2_read_block(struct inode *inode, u64 off,
+				   struct buffer_head **bh)
 {
 	int status = 0;
 
@@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
 		goto bail;
 	}
 
-	status = ocfs2_read_blocks(osb, off, 1, bh,
-				   flags, inode);
+	status = ocfs2_read_blocks(inode, off, 1, bh, 0);
 
 bail:
 	return status;
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 23c732f2752..d8a0cb92cef 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(CONN),
 	define_mask(QUORUM),
 	define_mask(EXPORT),
+	define_mask(XATTR),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 597e064bb94..57670c68047 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -112,6 +112,7 @@
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
+#define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 9cce563fd62..026e6eb8518 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
 
+static struct buffer_head *ocfs2_bread(struct inode *inode,
+				       int block, int *err, int reada)
+{
+	struct buffer_head *bh = NULL;
+	int tmperr;
+	u64 p_blkno;
+	int readflags = 0;
+
+	if (reada)
+		readflags |= OCFS2_BH_READAHEAD;
+
+	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!reada);
+		return NULL;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+					     NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (tmperr < 0) {
+		mlog_errno(tmperr);
+		goto fail;
+	}
+
+	tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
+	if (tmperr < 0)
+		goto fail;
+
+	tmperr = 0;
+
+	*err = 0;
+	return bh;
+
+fail:
+	brelse(bh);
+	bh = NULL;
+
+	*err = -EIO;
+	return NULL;
+}
+
 /*
  * bh passed here can be an inode block or a dir data block, depending
  * on the inode inline data flag.
@@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, dir);
+	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -260,14 +302,13 @@ restart:
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
 			goto next;
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
-			/* read error, skip block & hope for the best */
+		if (ocfs2_read_block(dir, block, &bh)) {
+			/* read error, skip block & hope for the best.
+			 * ocfs2_read_block() has released the bh. */
 			ocfs2_error(dir->i_sb, "reading directory %llu, "
 				    "offset %lu\n",
 				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
 				    block);
-			brelse(bh);
 			goto next;
 		}
 		i = ocfs2_search_dirblock(bh, dir, name, namelen,
@@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, dir);
+	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 	struct ocfs2_inline_data *data;
 	struct ocfs2_dir_entry *de;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
 			     i > 0; i--) {
 				tmp = ocfs2_bread(inode, ++blk, &err, 1);
-				if (tmp)
-					brelse(tmp);
+				brelse(tmp);
 			}
 			last_ra_blk = blk;
 			ra_sectors = 8;
@@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name,
 leave:
 	if (status < 0) {
 		*dirent = NULL;
-		if (*dirent_bh) {
-			brelse(*dirent_bh);
-			*dirent_bh = NULL;
-		}
+		brelse(*dirent_bh);
+		*dirent_bh = NULL;
 	}
 
 	mlog_exit(status);
@@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 
 	ret = 0;
 bail:
-	if (dirent_bh)
-		brelse(dirent_bh);
+	brelse(dirent_bh);
 
 	mlog_exit(ret);
 	return ret;
@@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
 	status = 0;
 bail:
-	if (new_bh)
-		brelse(new_bh);
+	brelse(new_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	struct buffer_head *dirdata_bh = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
 	alloc = ocfs2_clusters_for_bytes(sb, bytes);
 
@@ -1305,8 +1342,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * This should never fail as our extent list is empty and all
 	 * related blocks have been journaled already.
 	 */
-	ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
-				  NULL);
+	ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
+				  0, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1337,8 +1374,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		}
 		blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
 
-		ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
-					  len, 0, NULL);
+		ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
+					  blkno, len, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_commit;
@@ -1383,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	if (extend) {
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 
-		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
-						    1, 0, parent_fe_bh, handle,
-						    data_ac, meta_ac, NULL);
+		status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
+					      1, 0, parent_fe_bh, handle,
+					      data_ac, meta_ac, NULL);
 		BUG_ON(status == -EAGAIN);
 		if (status < 0) {
 			mlog_errno(status);
@@ -1430,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	int credits, num_free_extents, drop_alloc_sem = 0;
 	loff_t dir_i_size;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+	struct ocfs2_extent_list *el = &fe->id2.i_list;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle = NULL;
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_dir_entry * de;
 	struct super_block *sb = osb->sb;
+	struct ocfs2_extent_tree et;
 
 	mlog_entry_void();
 
@@ -1479,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	spin_lock(&OCFS2_I(dir)->ip_lock);
 	if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
-		num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+		ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
+		num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
 		if (num_free_extents < 0) {
 			status = num_free_extents;
 			mlog_errno(status);
@@ -1487,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		}
 
 		if (!num_free_extents) {
-			status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
+			status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
 			if (status < 0) {
 				if (status != -ENOSPC)
 					mlog_errno(status);
@@ -1502,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		credits = ocfs2_calc_extend_credits(sb, fe, 1);
+		credits = ocfs2_calc_extend_credits(sb, el, 1);
 	} else {
 		spin_unlock(&OCFS2_I(dir)->ip_lock);
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -1568,8 +1608,7 @@ bail:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
-	if (new_bh)
-		brelse(new_bh);
+	brelse(new_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1696,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 
 	status = 0;
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -1756,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	*ret_de_bh = bh;
 	bh = NULL;
 out:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 	return ret;
 }
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index eae3d643a5e..ec684426034 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 	} else {
 		/* Boo, we have to go to disk. */
 		/* read bh, cast, ocfs2_refresh_inode */
-		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
-					  bh, OCFS2_BH_CACHED, inode);
+		status = ocfs2_read_block(inode, oi->ip_blkno, bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail_refresh;
@@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode,
 		return 0;
 	}
 
-	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				  OCFS2_I(inode)->ip_blkno,
-				  ret_bh,
-				  OCFS2_BH_CACHED,
-				  inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
 	if (status < 0)
 		mlog_errno(status);
 
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index aed268e80b4..2baedac5823 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,8 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk,
-			       &eb_bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -382,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 			goto no_more_extents;
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+		ret = ocfs2_read_block(inode,
 				       le64_to_cpu(eb->h_next_leaf_blk),
-				       &next_eb_bh, OCFS2_BH_CACHED, inode);
+				       &next_eb_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -551,6 +550,66 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
 		*num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
 }
 
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el)
+{
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec;
+	u32 coff;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {
+		ret = -EROFS;
+		mlog_errno(ret);
+		goto out;
+	} else {
+		rec = &el->l_recs[i];
+		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+		if (!rec->e_blkno) {
+			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+				    "record (%u, %u, 0) in xattr", inode->i_ino,
+				    le32_to_cpu(rec->e_cpos),
+				    ocfs2_rec_clusters(el, rec));
+			ret = -EROFS;
+			goto out;
+		}
+		coff = v_cluster - le32_to_cpu(rec->e_cpos);
+		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+						    le64_to_cpu(rec->e_blkno));
+		*p_cluster = *p_cluster + coff;
+		if (num_clusters)
+			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+	}
+out:
+	if (eb_bh)
+		brelse(eb_bh);
+	return ret;
+}
+
 int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 		       u32 *p_cluster, u32 *num_clusters,
 		       unsigned int *extent_flags)
@@ -571,8 +630,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	if (ret == 0)
 		goto out;
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
-			       &di_bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1b97490e1ea..1c4aa8b06f3 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,4 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 u64 map_start, u64 map_len);
 
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+			     u32 *p_cluster, u32 *num_clusters,
+			     struct ocfs2_extent_list *el);
+
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed38796052d..8d3225a7807 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -55,6 +55,7 @@
 #include "mmap.h"
 #include "suballoc.h"
 #include "super.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file,
 		goto bail;
 
 	journal = osb->journal->j_journal;
-	err = journal_force_commit(journal);
+	err = jbd2_journal_force_commit(journal);
 
 bail:
 	mlog_exit(err);
@@ -488,7 +489,7 @@ bail:
 }
 
 /*
- * extend allocation only here.
+ * extend file allocation only here.
  * we'll update all the disk stuff, and oip->alloc_size
  *
  * expect stuff to be locked, a transaction started and enough data /
@@ -497,189 +498,25 @@ bail:
  * Will return -EAGAIN, and a reason if a restart is needed.
  * If passed in, *reason will always be set, even in error.
  */
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
-			       struct inode *inode,
-			       u32 *logical_offset,
-			       u32 clusters_to_add,
-			       int mark_unwritten,
-			       struct buffer_head *fe_bh,
-			       handle_t *handle,
-			       struct ocfs2_alloc_context *data_ac,
-			       struct ocfs2_alloc_context *meta_ac,
-			       enum ocfs2_alloc_restarted *reason_ret)
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+			 struct inode *inode,
+			 u32 *logical_offset,
+			 u32 clusters_to_add,
+			 int mark_unwritten,
+			 struct buffer_head *fe_bh,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *data_ac,
+			 struct ocfs2_alloc_context *meta_ac,
+			 enum ocfs2_alloc_restarted *reason_ret)
 {
-	int status = 0;
-	int free_extents;
-	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	enum ocfs2_alloc_restarted reason = RESTART_NONE;
-	u32 bit_off, num_bits;
-	u64 block;
-	u8 flags = 0;
-
-	BUG_ON(!clusters_to_add);
-
-	if (mark_unwritten)
-		flags = OCFS2_EXT_UNWRITTEN;
-
-	free_extents = ocfs2_num_free_extents(osb, inode, fe);
-	if (free_extents < 0) {
-		status = free_extents;
-		mlog_errno(status);
-		goto leave;
-	}
-
-	/* there are two cases which could cause us to EAGAIN in the
-	 * we-need-more-metadata case:
-	 * 1) we haven't reserved *any*
-	 * 2) we are so fragmented, we've needed to add metadata too
-	 *    many times. */
-	if (!free_extents && !meta_ac) {
-		mlog(0, "we haven't reserved any metadata!\n");
-		status = -EAGAIN;
-		reason = RESTART_META;
-		goto leave;
-	} else if ((!free_extents)
-		   && (ocfs2_alloc_context_bits_left(meta_ac)
-		       < ocfs2_extend_meta_needed(fe))) {
-		mlog(0, "filesystem is really fragmented...\n");
-		status = -EAGAIN;
-		reason = RESTART_META;
-		goto leave;
-	}
-
-	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
-					clusters_to_add, &bit_off, &num_bits);
-	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
-		goto leave;
-	}
-
-	BUG_ON(num_bits > clusters_to_add);
-
-	/* reserve our write early -- insert_extent may update the inode */
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
-	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
-				     *logical_offset, block, num_bits,
-				     flags, meta_ac);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
-	clusters_to_add -= num_bits;
-	*logical_offset += num_bits;
-
-	if (clusters_to_add) {
-		mlog(0, "need to alloc once more, clusters = %u, wanted = "
-		     "%u\n", fe->i_clusters, clusters_to_add);
-		status = -EAGAIN;
-		reason = RESTART_TRANS;
-	}
-
-leave:
-	mlog_exit(status);
-	if (reason_ret)
-		*reason_ret = reason;
-	return status;
-}
-
-/*
- * For a given allocation, determine which allocators will need to be
- * accessed, and lock them, reserving the appropriate number of bits.
- *
- * Sparse file systems call this from ocfs2_write_begin_nolock()
- * and ocfs2_allocate_unwritten_extents().
- *
- * File systems which don't support holes call this from
- * ocfs2_extend_allocation().
- */
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add, u32 extents_to_split,
-			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac)
-{
-	int ret = 0, num_free_extents;
-	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	*meta_ac = NULL;
-	if (data_ac)
-		*data_ac = NULL;
-
-	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
-
-	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u, extents_to_split = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
-	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
-
-	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
-	if (num_free_extents < 0) {
-		ret = num_free_extents;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	/*
-	 * Sparse allocation file systems need to be more conservative
-	 * with reserving room for expansion - the actual allocation
-	 * happens while we've got a journal handle open so re-taking
-	 * a cluster lock (because we ran out of room for another
-	 * extent) will violate ordering rules.
-	 *
-	 * Most of the time we'll only be seeing this 1 cluster at a time
-	 * anyway.
-	 *
-	 * Always lock for any unwritten extents - we might want to
-	 * add blocks during a split.
-	 */
-	if (!num_free_extents ||
-	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
-		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
-		if (ret < 0) {
-			if (ret != -ENOSPC)
-				mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	if (clusters_to_add == 0)
-		goto out;
-
-	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
-	if (ret < 0) {
-		if (ret != -ENOSPC)
-			mlog_errno(ret);
-		goto out;
-	}
-
-out:
-	if (ret) {
-		if (*meta_ac) {
-			ocfs2_free_alloc_context(*meta_ac);
-			*meta_ac = NULL;
-		}
+	int ret;
+	struct ocfs2_extent_tree et;
 
-		/*
-		 * We cannot have an error and a non null *data_ac.
-		 */
-	}
+	ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
+	ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
+					   clusters_to_add, mark_unwritten,
+					   &et, handle,
+					   data_ac, meta_ac, reason_ret);
 
 	return ret;
 }
@@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_tree et;
 
 	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 
@@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	 */
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
-				  OCFS2_BH_CACHED, inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
-				       &meta_ac);
+	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
+	     clusters_to_add);
+	ocfs2_init_dinode_extent_tree(&et, inode, bh);
+	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				       &data_ac, &meta_ac);
 	if (status) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
+					    clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -753,16 +597,16 @@ restarted_transaction:
 
 	prev_clusters = OCFS2_I(inode)->ip_clusters;
 
-	status = ocfs2_do_extend_allocation(osb,
-					    inode,
-					    &logical_start,
-					    clusters_to_add,
-					    mark_unwritten,
-					    bh,
-					    handle,
-					    data_ac,
-					    meta_ac,
-					    &why);
+	status = ocfs2_add_inode_data(osb,
+				      inode,
+				      &logical_start,
+				      clusters_to_add,
+				      mark_unwritten,
+				      bh,
+				      handle,
+				      data_ac,
+				      meta_ac,
+				      &why);
 	if ((status < 0) && (status != -EAGAIN)) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -789,7 +633,7 @@ restarted_transaction:
 			mlog(0, "restarting transaction.\n");
 			/* TODO: This can be more intelligent. */
 			credits = ocfs2_calc_extend_credits(osb->sb,
-							    fe,
+							    &fe->id2.i_list,
 							    clusters_to_add);
 			status = ocfs2_extend_trans(handle, credits);
 			if (status < 0) {
@@ -826,10 +670,8 @@ leave:
 		restart_func = 0;
 		goto restart_all;
 	}
-	if (bh) {
-		brelse(bh);
-		bh = NULL;
-	}
+	brelse(bh);
+	bh = NULL;
 
 	mlog_exit(status);
 	return status;
@@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 			goto bail_unlock;
 		}
 
-		if (i_size_read(inode) > attr->ia_size)
+		if (i_size_read(inode) > attr->ia_size) {
+			if (ocfs2_should_order_data(inode)) {
+				status = ocfs2_begin_ordered_truncate(inode,
+								      attr->ia_size);
+				if (status)
+					goto bail_unlock;
+			}
 			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
-		else
+		} else
 			status = ocfs2_extend_file(inode, bh, attr->ia_size);
 		if (status < 0) {
 			if (status != -ENOSPC)
@@ -1140,8 +988,7 @@ bail_unlock_rw:
 	if (size_change)
 		ocfs2_rw_unlock(inode, 1);
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
@@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 	struct buffer_head *bh = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-			       oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
 	struct buffer_head *di_bh = NULL;
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       OCFS2_I(inode)->ip_blkno, &di_bh,
-				       OCFS2_BH_CACHED, inode);
+		ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+				       &di_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 	handle_t *handle;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_tree et;
 
-	ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
+
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
@@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
 				  dealloc);
 	if (ret) {
 		mlog_errno(ret);
@@ -2040,7 +1888,7 @@ out_dio:
 		 */
 		if (old_size != i_size_read(inode) ||
 		    old_clusters != OCFS2_I(inode)->ip_clusters) {
-			ret = journal_force_commit(osb->journal->j_journal);
+			ret = jbd2_journal_force_commit(osb->journal->j_journal);
 			if (ret < 0)
 				written = ret;
 		}
@@ -2227,6 +2075,10 @@ const struct inode_operations ocfs2_file_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 	.fallocate	= ocfs2_fallocate,
 	.fiemap		= ocfs2_fiemap,
 };
@@ -2237,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = {
 	.permission	= ocfs2_permission,
 };
 
+/*
+ * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
+ * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
+ */
 const struct file_operations ocfs2_fops = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -2251,6 +2107,7 @@ const struct file_operations ocfs2_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+	.lock		= ocfs2_lock,
 	.flock		= ocfs2_flock,
 	.splice_read	= ocfs2_file_splice_read,
 	.splice_write	= ocfs2_file_splice_write,
@@ -2267,5 +2124,51 @@ const struct file_operations ocfs2_dops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ocfs2_compat_ioctl,
 #endif
+	.lock		= ocfs2_lock,
+	.flock		= ocfs2_flock,
+};
+
+/*
+ * POSIX-lockless variants of our file_operations.
+ *
+ * These will be used if the underlying cluster stack does not support
+ * posix file locking, if the user passes the "localflocks" mount
+ * option, or if we have a local-only fs.
+ *
+ * ocfs2_flock is in here because all stacks handle UNIX file locks,
+ * so we still want it in the case of no stack support for
+ * plocks. Internally, it will do the right thing when asked to ignore
+ * the cluster.
+ */
+const struct file_operations ocfs2_fops_no_plocks = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.mmap		= ocfs2_mmap,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_file_release,
+	.open		= ocfs2_file_open,
+	.aio_read	= ocfs2_file_aio_read,
+	.aio_write	= ocfs2_file_aio_write,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
+	.flock		= ocfs2_flock,
+	.splice_read	= ocfs2_file_splice_read,
+	.splice_write	= ocfs2_file_splice_write,
+};
+
+const struct file_operations ocfs2_dops_no_plocks = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= ocfs2_readdir,
+	.fsync		= ocfs2_sync_file,
+	.release	= ocfs2_dir_release,
+	.open		= ocfs2_dir_open,
+	.unlocked_ioctl	= ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl   = ocfs2_compat_ioctl,
+#endif
 	.flock		= ocfs2_flock,
 };
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 1e27b4d017e..e92382cbca5 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -28,9 +28,12 @@
 
 extern const struct file_operations ocfs2_fops;
 extern const struct file_operations ocfs2_dops;
+extern const struct file_operations ocfs2_fops_no_plocks;
+extern const struct file_operations ocfs2_dops_no_plocks;
 extern const struct inode_operations ocfs2_file_iops;
 extern const struct inode_operations ocfs2_special_file_iops;
 struct ocfs2_alloc_context;
+enum ocfs2_alloc_restarted;
 
 struct ocfs2_file_private {
 	struct file		*fp_file;
@@ -38,27 +41,18 @@ struct ocfs2_file_private {
 	struct ocfs2_lock_res	fp_flock;
 };
 
-enum ocfs2_alloc_restarted {
-	RESTART_NONE = 0,
-	RESTART_TRANS,
-	RESTART_META
-};
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
-			       struct inode *inode,
-			       u32 *logical_offset,
-			       u32 clusters_to_add,
-			       int mark_unwritten,
-			       struct buffer_head *fe_bh,
-			       handle_t *handle,
-			       struct ocfs2_alloc_context *data_ac,
-			       struct ocfs2_alloc_context *meta_ac,
-			       enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+			 struct inode *inode,
+			 u32 *logical_offset,
+			 u32 clusters_to_add,
+			 int mark_unwritten,
+			 struct buffer_head *fe_bh,
+			 handle_t *handle,
+			 struct ocfs2_alloc_context *data_ac,
+			 struct ocfs2_alloc_context *meta_ac,
+			 enum ocfs2_alloc_restarted *reason_ret);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add, u32 extents_to_split,
-			  struct ocfs2_alloc_context **data_ac,
-			  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c79aec..4903688f72a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,6 +49,7 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	struct super_block *sb;
 	struct ocfs2_super *osb;
 	int status = -EINVAL;
+	int use_plocks = 1;
 
 	mlog_entry("(0x%p, size:%llu)\n", inode,
 		   (unsigned long long)le64_to_cpu(fe->i_size));
@@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	sb = inode->i_sb;
 	osb = OCFS2_SB(sb);
 
+	if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+	    ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
+		use_plocks = 0;
+
 	/* this means that read_inode cannot create a superblock inode
 	 * today.  change if needed. */
 	if (!OCFS2_IS_VALID_DINODE(fe) ||
@@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	switch (inode->i_mode & S_IFMT) {
 	    case S_IFREG:
-		    inode->i_fop = &ocfs2_fops;
+		    if (use_plocks)
+			    inode->i_fop = &ocfs2_fops;
+		    else
+			    inode->i_fop = &ocfs2_fops_no_plocks;
 		    inode->i_op = &ocfs2_file_iops;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
 		    break;
 	    case S_IFDIR:
 		    inode->i_op = &ocfs2_dir_iops;
-		    inode->i_fop = &ocfs2_dops;
+		    if (use_plocks)
+			    inode->i_fop = &ocfs2_dops;
+		    else
+			    inode->i_fop = &ocfs2_dops_no_plocks;
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
 		    break;
 	    case S_IFLNK:
@@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 		}
 	}
 
-	status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
-				  can_lock ? inode : NULL);
+	if (can_lock)
+		status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
+					   OCFS2_BH_IGNORE_CACHE);
+	else
+		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 	 * data and fast symlinks.
 	 */
 	if (fe->i_clusters) {
+		if (ocfs2_should_order_data(inode))
+			ocfs2_begin_ordered_truncate(inode, 0);
+
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 		if (IS_ERR(handle)) {
 			status = PTR_ERR(handle);
@@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	/*Free extended attribute resources associated with this inode.*/
+	status = ocfs2_xattr_remove(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
 	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
 				    orphan_dir_bh);
 	if (status < 0)
@@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode)
 	oi->ip_last_trans = 0;
 	oi->ip_dir_start_lookup = 0;
 	oi->ip_blkno = 0ULL;
+	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+				       &oi->ip_jinode);
 
 bail:
 	mlog_exit_void();
@@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode)
 }
 
 /*
- * TODO: this should probably be merged into ocfs2_get_block
- *
- * However, you now need to pay attention to the cont_prepare_write()
- * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
- * expects never to extend).
- */
-struct buffer_head *ocfs2_bread(struct inode *inode,
-				int block, int *err, int reada)
-{
-	struct buffer_head *bh = NULL;
-	int tmperr;
-	u64 p_blkno;
-	int readflags = OCFS2_BH_CACHED;
-
-	if (reada)
-		readflags |= OCFS2_BH_READAHEAD;
-
-	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
-	    i_size_read(inode)) {
-		BUG_ON(!reada);
-		return NULL;
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-					     NULL);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-	if (tmperr < 0) {
-		mlog_errno(tmperr);
-		goto fail;
-	}
-
-	tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
-				  readflags, inode);
-	if (tmperr < 0)
-		goto fail;
-
-	tmperr = 0;
-
-	*err = 0;
-	return bh;
-
-fail:
-	if (bh) {
-		brelse(bh);
-		bh = NULL;
-	}
-	*err = -EIO;
-	return NULL;
-}
-
-/*
  * This is called from our getattr.
  */
 int ocfs2_inode_revalidate(struct dentry *dentry)
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 390a85596aa..2f37af9bcc4 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -40,6 +40,9 @@ struct ocfs2_inode_info
 	/* protects allocation changes on this inode. */
 	struct rw_semaphore		ip_alloc_sem;
 
+	/* protects extended attribute changes on this inode */
+	struct rw_semaphore		ip_xattr_sem;
+
 	/* These fields are protected by ip_lock */
 	spinlock_t			ip_lock;
 	u32				ip_open_count;
@@ -68,6 +71,7 @@ struct ocfs2_inode_info
 	struct ocfs2_extent_map		ip_extent_map;
 
 	struct inode			vfs_inode;
+	struct jbd2_inode		ip_jinode;
 };
 
 /*
@@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache;
 
 extern const struct address_space_operations ocfs2_aops;
 
-struct buffer_head *ocfs2_bread(struct inode *inode, int block,
-				int *err, int reada);
 void ocfs2_clear_inode(struct inode *inode);
 void ocfs2_delete_inode(struct inode *inode);
 void ocfs2_drop_inode(struct inode *inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7b142f0ce99..9fcd36dcc9a 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -102,8 +102,7 @@ bail_unlock:
 bail:
 	mutex_unlock(&inode->i_mutex);
 
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c47bc2a809c..81e40677eec 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 		goto finally;
 	}
 
-	journal_lock_updates(journal->j_journal);
-	status = journal_flush(journal->j_journal);
-	journal_unlock_updates(journal->j_journal);
+	jbd2_journal_lock_updates(journal->j_journal);
+	status = jbd2_journal_flush(journal->j_journal);
+	jbd2_journal_unlock_updates(journal->j_journal);
 	if (status < 0) {
 		up_write(&journal->j_trans_barrier);
 		mlog_errno(status);
@@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 
 	down_read(&osb->journal->j_trans_barrier);
 
-	handle = journal_start(journal, max_buffs);
+	handle = jbd2_journal_start(journal, max_buffs);
 	if (IS_ERR(handle)) {
 		up_read(&osb->journal->j_trans_barrier);
 
@@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
 
 	BUG_ON(!handle);
 
-	ret = journal_stop(handle);
+	ret = jbd2_journal_stop(handle);
 	if (ret < 0)
 		mlog_errno(ret);
 
@@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
  * transaction. extend_trans will either extend the current handle by
  * nblocks, or commit it and start a new one with nblocks credits.
  *
- * This might call journal_restart() which will commit dirty buffers
+ * This might call jbd2_journal_restart() which will commit dirty buffers
  * and then restart the transaction. Before calling
  * ocfs2_extend_trans(), any changed blocks should have been
  * dirtied. After calling it, all blocks which need to be changed must
@@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 #ifdef CONFIG_OCFS2_DEBUG_FS
 	status = 1;
 #else
-	status = journal_extend(handle, nblocks);
+	status = jbd2_journal_extend(handle, nblocks);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 #endif
 
 	if (status > 0) {
-		mlog(0, "journal_extend failed, trying journal_restart\n");
-		status = journal_restart(handle, nblocks);
+		mlog(0,
+		     "jbd2_journal_extend failed, trying "
+		     "jbd2_journal_restart\n");
+		status = jbd2_journal_restart(handle, nblocks);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle,
 	switch (type) {
 	case OCFS2_JOURNAL_ACCESS_CREATE:
 	case OCFS2_JOURNAL_ACCESS_WRITE:
-		status = journal_get_write_access(handle, bh);
+		status = jbd2_journal_get_write_access(handle, bh);
 		break;
 
 	case OCFS2_JOURNAL_ACCESS_UNDO:
-		status = journal_get_undo_access(handle, bh);
+		status = jbd2_journal_get_undo_access(handle, bh);
 		break;
 
 	default:
@@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle,
 	mlog_entry("(bh->b_blocknr=%llu)\n",
 		   (unsigned long long)bh->b_blocknr);
 
-	status = journal_dirty_metadata(handle, bh);
+	status = jbd2_journal_dirty_metadata(handle, bh);
 	if (status < 0)
 		mlog(ML_ERROR, "Could not dirty metadata buffer. "
 		     "(bh->b_blocknr=%llu)\n",
@@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle,
 	return status;
 }
 
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 int ocfs2_journal_dirty_data(handle_t *handle,
 			     struct buffer_head *bh)
 {
@@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
 
 	return err;
 }
+#endif
 
-#define OCFS2_DEFAULT_COMMIT_INTERVAL 	(HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
 
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
 {
@@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
 	spin_lock(&journal->j_state_lock);
 	journal->j_commit_interval = commit_interval;
 	if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
-		journal->j_flags |= JFS_BARRIER;
+		journal->j_flags |= JBD2_BARRIER;
 	else
-		journal->j_flags &= ~JFS_BARRIER;
+		journal->j_flags &= ~JBD2_BARRIER;
 	spin_unlock(&journal->j_state_lock);
 }
 
@@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
 
 	/* call the kernels journal init function now */
-	j_journal = journal_init_inode(inode);
+	j_journal = jbd2_journal_init_inode(inode);
 	if (j_journal == NULL) {
 		mlog(ML_ERROR, "Linux journal layer error\n");
 		status = -EINVAL;
 		goto done;
 	}
 
-	mlog(0, "Returned from journal_init_inode\n");
+	mlog(0, "Returned from jbd2_journal_init_inode\n");
 	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
 
 	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
@@ -550,8 +554,7 @@ done:
 	if (status < 0) {
 		if (inode_lock)
 			ocfs2_inode_unlock(inode, 1);
-		if (bh != NULL)
-			brelse(bh);
+		brelse(bh);
 		if (inode) {
 			OCFS2_I(inode)->ip_open_count--;
 			iput(inode);
@@ -639,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	if (journal->j_state != OCFS2_JOURNAL_LOADED)
 		goto done;
 
-	/* need to inc inode use count as journal_destroy will iput. */
+	/* need to inc inode use count - jbd2_journal_destroy will iput. */
 	if (!igrab(inode))
 		BUG();
 
@@ -668,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
 
 	if (ocfs2_mount_local(osb)) {
-		journal_lock_updates(journal->j_journal);
-		status = journal_flush(journal->j_journal);
-		journal_unlock_updates(journal->j_journal);
+		jbd2_journal_lock_updates(journal->j_journal);
+		status = jbd2_journal_flush(journal->j_journal);
+		jbd2_journal_unlock_updates(journal->j_journal);
 		if (status < 0)
 			mlog_errno(status);
 	}
@@ -686,7 +689,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	}
 
 	/* Shutdown the kernel journal system */
-	journal_destroy(journal->j_journal);
+	jbd2_journal_destroy(journal->j_journal);
 
 	OCFS2_I(inode)->ip_open_count--;
 
@@ -711,15 +714,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
 {
 	int olderr;
 
-	olderr = journal_errno(journal);
+	olderr = jbd2_journal_errno(journal);
 	if (olderr) {
 		mlog(ML_ERROR, "File system error %d recorded in "
 		     "journal %u.\n", olderr, slot);
 		mlog(ML_ERROR, "File system on device %s needs checking.\n",
 		     sb->s_id);
 
-		journal_ack_err(journal);
-		journal_clear_err(journal);
+		jbd2_journal_ack_err(journal);
+		jbd2_journal_clear_err(journal);
 	}
 }
 
@@ -734,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 
 	osb = journal->j_osb;
 
-	status = journal_load(journal->j_journal);
+	status = jbd2_journal_load(journal->j_journal);
 	if (status < 0) {
 		mlog(ML_ERROR, "Failed to load journal!\n");
 		goto done;
@@ -778,7 +781,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
 
 	BUG_ON(!journal);
 
-	status = journal_wipe(journal->j_journal, full);
+	status = jbd2_journal_wipe(journal->j_journal, full);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -847,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
 
 		/* We are reading journal data which should not
 		 * be put in the uptodate cache */
-		status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
-					   p_blkno, p_blocks, bhs, 0,
-					   NULL);
+		status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
+						p_blkno, p_blocks, bhs);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -865,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
 
 bail:
 	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
-		if (bhs[i])
-			brelse(bhs[i]);
+		brelse(bhs[i]);
 	mlog_exit(status);
 	return status;
 }
@@ -1133,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
 	}
 	SET_INODE_JOURNAL(inode);
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode);
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
+				   OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1229,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	}
 
 	mlog(0, "calling journal_init_inode\n");
-	journal = journal_init_inode(inode);
+	journal = jbd2_journal_init_inode(inode);
 	if (journal == NULL) {
 		mlog(ML_ERROR, "Linux journal layer error\n");
 		status = -EIO;
 		goto done;
 	}
 
-	status = journal_load(journal);
+	status = jbd2_journal_load(journal);
 	if (status < 0) {
 		mlog_errno(status);
 		if (!igrab(inode))
 			BUG();
-		journal_destroy(journal);
+		jbd2_journal_destroy(journal);
 		goto done;
 	}
 
@@ -1249,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 
 	/* wipe the journal */
 	mlog(0, "flushing the journal.\n");
-	journal_lock_updates(journal);
-	status = journal_flush(journal);
-	journal_unlock_updates(journal);
+	jbd2_journal_lock_updates(journal);
+	status = jbd2_journal_flush(journal);
+	jbd2_journal_unlock_updates(journal);
 	if (status < 0)
 		mlog_errno(status);
 
@@ -1272,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	if (!igrab(inode))
 		BUG();
 
-	journal_destroy(journal);
+	jbd2_journal_destroy(journal);
 
 done:
 	/* drop the lock on this nodes journal */
@@ -1282,8 +1284,7 @@ done:
 	if (inode)
 		iput(inode);
 
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2178ebffa05..d4d14e9a3ce 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,7 +27,12 @@
 #define OCFS2_JOURNAL_H
 
 #include <linux/fs.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
 
 enum ocfs2_journal_state {
 	OCFS2_JOURNAL_FREE = 0,
@@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
  *                          buffer. Will have to call ocfs2_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
- *  ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
- *                             the current handle commits.
+ *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
+ *                           the current handle commits.
  */
 
 /* You must always start_trans with a number of buffs > 0, but it's
@@ -268,8 +273,10 @@ int                  ocfs2_journal_access(handle_t *handle,
  */
 int                  ocfs2_journal_dirty(handle_t *handle,
 					 struct buffer_head *bh);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
 int                  ocfs2_journal_dirty_data(handle_t *handle,
 					      struct buffer_head *bh);
+#endif
 
 /*
  *  Credit Macros:
@@ -283,6 +290,9 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 /* simple file updates like chmod, etc. */
 #define OCFS2_INODE_UPDATE_CREDITS 1
 
+/* extended attribute block update */
+#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 
@@ -340,11 +350,23 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
 #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
 			     + OCFS2_UNLINK_CREDITS)
 
+/* global bitmap dinode, group desc., relinked group,
+ * suballocator dinode, group desc., relinked group,
+ * dinode, xattr block */
+#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
+					  + OCFS2_INODE_UPDATE_CREDITS \
+					  + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
+
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
 static inline int ocfs2_calc_extend_credits(struct super_block *sb,
-					    struct ocfs2_dinode *fe,
+					    struct ocfs2_extent_list *root_el,
 					    u32 bits_wanted)
 {
-	int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+	int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
 
 	/* bitmap dinode, group desc. + relinked group. */
 	bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -355,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 	 * however many metadata chunks needed * a remaining suballoc
 	 * alloc. */
 	sysfile_bitmap_blocks = 1 +
-		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
 
 	/* this does not include *new* metadata blocks, which are
-	 * accounted for in sysfile_bitmap_blocks. fe +
+	 * accounted for in sysfile_bitmap_blocks. root_el +
 	 * prev. last_eb_blk + blocks along edge of tree.
 	 * calc_symlink_credits passes because we just need 1
 	 * credit for the dinode there. */
-	dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 
-	return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
 }
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
@@ -415,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	return credits;
 }
 
+static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+	return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+}
+
+static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
+					       loff_t new_size)
+{
+	return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
+						   new_size);
+}
+
 #endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 28e492e4ec8..687b28713c3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -47,8 +48,6 @@
 
 #define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))
 
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
-
 static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
 
 static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
@@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 					  struct inode *local_alloc_inode);
 
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
+#ifdef CONFIG_OCFS2_FS_STATS
+
+static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+#define LA_DEBUG_BUF_SZ	PAGE_CACHE_SIZE
+#define LA_DEBUG_VER	1
+static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
+				   size_t count, loff_t *ppos)
+{
+	static DEFINE_MUTEX(la_debug_mutex);
+	struct ocfs2_super *osb = file->private_data;
+	int written, ret;
+	char *buf = osb->local_alloc_debug_buf;
+
+	mutex_lock(&la_debug_mutex);
+	memset(buf, 0, LA_DEBUG_BUF_SZ);
+
+	written = snprintf(buf, LA_DEBUG_BUF_SZ,
+			   "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
+			   LA_DEBUG_VER,
+			   (unsigned long long)osb->la_last_gd,
+			   osb->local_alloc_default_bits,
+			   osb->local_alloc_bits, osb->local_alloc_state);
+
+	ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
+
+	mutex_unlock(&la_debug_mutex);
+	return ret;
+}
+
+static const struct file_operations ocfs2_la_debug_fops = {
+	.open =		ocfs2_la_debug_open,
+	.read =		ocfs2_la_debug_read,
+};
+
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+	osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
+	if (!osb->local_alloc_debug_buf)
+		return;
+
+	osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
+						     S_IFREG|S_IRUSR,
+						     osb->osb_debug_root,
+						     osb,
+						     &ocfs2_la_debug_fops);
+	if (!osb->local_alloc_debug) {
+		kfree(osb->local_alloc_debug_buf);
+		osb->local_alloc_debug_buf = NULL;
+	}
+}
+
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+	if (osb->local_alloc_debug)
+		debugfs_remove(osb->local_alloc_debug);
+
+	if (osb->local_alloc_debug_buf)
+		kfree(osb->local_alloc_debug_buf);
+
+	osb->local_alloc_debug_buf = NULL;
+	osb->local_alloc_debug = NULL;
+}
+#else	/* CONFIG_OCFS2_FS_STATS */
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+	return;
+}
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+	return;
+}
+#endif
+
+static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
 {
-	BUG_ON(osb->s_clustersize_bits > 20);
+	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
+		osb->local_alloc_state == OCFS2_LA_ENABLED);
+}
 
-	/* Size local alloc windows by the megabyte */
-	return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+				      unsigned int num_clusters)
+{
+	spin_lock(&osb->osb_lock);
+	if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
+	    osb->local_alloc_state == OCFS2_LA_THROTTLED)
+		if (num_clusters >= osb->local_alloc_default_bits) {
+			cancel_delayed_work(&osb->la_enable_wq);
+			osb->local_alloc_state = OCFS2_LA_ENABLED;
+		}
+	spin_unlock(&osb->osb_lock);
+}
+
+void ocfs2_la_enable_worker(struct work_struct *work)
+{
+	struct ocfs2_super *osb =
+		container_of(work, struct ocfs2_super,
+			     la_enable_wq.work);
+	spin_lock(&osb->osb_lock);
+	osb->local_alloc_state = OCFS2_LA_ENABLED;
+	spin_unlock(&osb->osb_lock);
 }
 
 /*
  * Tell us whether a given allocation should use the local alloc
  * file. Otherwise, it has to go to the main bitmap.
+ *
+ * This function does semi-dirty reads of local alloc size and state!
+ * This is ok however, as the values are re-checked once under mutex.
  */
 int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 {
-	int la_bits = ocfs2_local_alloc_window_bits(osb);
 	int ret = 0;
+	int la_bits;
+
+	spin_lock(&osb->osb_lock);
+	la_bits = osb->local_alloc_bits;
 
-	if (osb->local_alloc_state != OCFS2_LA_ENABLED)
+	if (!ocfs2_la_state_enabled(osb))
 		goto bail;
 
 	/* la_bits should be at least twice the size (in clusters) of
@@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 bail:
 	mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
 	     osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+	spin_unlock(&osb->osb_lock);
 	return ret;
 }
 
@@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	if (osb->local_alloc_size == 0)
+	ocfs2_init_la_debug(osb);
+
+	if (osb->local_alloc_bits == 0)
 		goto bail;
 
-	if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
+	if (osb->local_alloc_bits >= osb->bitmap_cpg) {
 		mlog(ML_NOTICE, "Requested local alloc window %d is larger "
 		     "than max possible %u. Using defaults.\n",
-		     ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
-		osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+		     osb->local_alloc_bits, (osb->bitmap_cpg - 1));
+		osb->local_alloc_bits =
+			ocfs2_megabytes_to_clusters(osb->sb,
+						    OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
 	}
 
 	/* read the alloc off disk */
@@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
-				  &alloc_bh, 0, inode);
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 
 bail:
 	if (status < 0)
-		if (alloc_bh)
-			brelse(alloc_bh);
+		brelse(alloc_bh);
 	if (inode)
 		iput(inode);
 
-	mlog(0, "Local alloc window bits = %d\n",
-	     ocfs2_local_alloc_window_bits(osb));
+	if (status < 0)
+		ocfs2_shutdown_la_debug(osb);
+
+	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
 	mlog_exit(status);
 	return status;
@@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
+	cancel_delayed_work(&osb->la_enable_wq);
+	flush_workqueue(ocfs2_wq);
+
+	ocfs2_shutdown_la_debug(osb);
+
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
 
@@ -295,8 +410,7 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(main_bm_bh);
 
 	ocfs2_inode_unlock(main_bm_inode, 1);
 
@@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 
 	mutex_lock(&inode->i_mutex);
 
-	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
-				  &alloc_bh, 0, inode);
+	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -372,8 +486,7 @@ bail:
 		*alloc_copy = NULL;
 	}
 
-	if (alloc_bh)
-		brelse(alloc_bh);
+	brelse(alloc_bh);
 
 	if (inode) {
 		mutex_unlock(&inode->i_mutex);
@@ -441,8 +554,7 @@ out_unlock:
 out_mutex:
 	mutex_unlock(&main_bm_inode->i_mutex);
 
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(main_bm_bh);
 
 	iput(main_bm_inode);
 
@@ -453,8 +565,48 @@ out:
 	return status;
 }
 
+/* Check to see if the local alloc window is within ac->ac_max_block */
+static int ocfs2_local_alloc_in_range(struct inode *inode,
+				      struct ocfs2_alloc_context *ac,
+				      u32 bits_wanted)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+	int start;
+	u64 block_off;
+
+	if (!ac->ac_max_block)
+		return 1;
+
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+	if (start == -1) {
+		mlog_errno(-ENOSPC);
+		return 0;
+	}
+
+	/*
+	 * Converting (bm_off + start + bits_wanted) to blocks gives us
+	 * the blkno just past our actual allocation.  This is perfect
+	 * to compare with ac_max_block.
+	 */
+	block_off = ocfs2_clusters_to_blocks(inode->i_sb,
+					     le32_to_cpu(la->la_bm_off) +
+					     start + bits_wanted);
+	mlog(0, "Checking %llu against %llu\n",
+	     (unsigned long long)block_off,
+	     (unsigned long long)ac->ac_max_block);
+	if (block_off > ac->ac_max_block)
+		return 0;
+
+	return 1;
+}
+
 /*
- * make sure we've got at least bitswanted contiguous bits in the
+ * make sure we've got at least bits_wanted contiguous bits in the
  * local alloc. You lose them when you drop i_mutex.
  *
  * We will add ourselves to the transaction passed in, but may start
@@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 
 	mutex_lock(&local_alloc_inode->i_mutex);
 
-	if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
-		status = -ENOSPC;
-		goto bail;
-	}
-
-	if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
-		mlog(0, "Asking for more than my max window size!\n");
+	/*
+	 * We must double check state and allocator bits because
+	 * another process may have changed them while holding i_mutex.
+	 */
+	spin_lock(&osb->osb_lock);
+	if (!ocfs2_la_state_enabled(osb) ||
+	    (bits_wanted > osb->local_alloc_bits)) {
+		spin_unlock(&osb->osb_lock);
 		status = -ENOSPC;
 		goto bail;
 	}
+	spin_unlock(&osb->osb_lock);
 
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 
@@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 				mlog_errno(status);
 			goto bail;
 		}
+
+		/*
+		 * Under certain conditions, the window slide code
+		 * might have reduced the number of bits available or
+		 * disabled the the local alloc entirely. Re-check
+		 * here and return -ENOSPC if necessary.
+		 */
+		status = -ENOSPC;
+		if (!ocfs2_la_state_enabled(osb))
+			goto bail;
+
+		free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+			le32_to_cpu(alloc->id1.bitmap1.i_used);
+		if (bits_wanted > free_bits)
+			goto bail;
+	}
+
+	if (ac->ac_max_block)
+		mlog(0, "Calling in_range for max block %llu\n",
+		     (unsigned long long)ac->ac_max_block);
+
+	if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
+					bits_wanted)) {
+		/*
+		 * The window is outside ac->ac_max_block.
+		 * This errno tells the caller to keep localalloc enabled
+		 * but to get the allocation from the main bitmap.
+		 */
+		status = -EFBIG;
+		goto bail;
 	}
 
 	ac->ac_inode = local_alloc_inode;
@@ -789,6 +973,85 @@ bail:
 	return status;
 }
 
+enum ocfs2_la_event {
+	OCFS2_LA_EVENT_SLIDE,		/* Normal window slide. */
+	OCFS2_LA_EVENT_FRAGMENTED,	/* The global bitmap has
+					 * enough bits theoretically
+					 * free, but a contiguous
+					 * allocation could not be
+					 * found. */
+	OCFS2_LA_EVENT_ENOSPC,		/* Global bitmap doesn't have
+					 * enough bits free to satisfy
+					 * our request. */
+};
+#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
+/*
+ * Given an event, calculate the size of our next local alloc window.
+ *
+ * This should always be called under i_mutex of the local alloc inode
+ * so that local alloc disabling doesn't race with processes trying to
+ * use the allocator.
+ *
+ * Returns the state which the local alloc was left in. This value can
+ * be ignored by some paths.
+ */
+static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
+				  enum ocfs2_la_event event)
+{
+	unsigned int bits;
+	int state;
+
+	spin_lock(&osb->osb_lock);
+	if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
+		WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
+		goto out_unlock;
+	}
+
+	/*
+	 * ENOSPC and fragmentation are treated similarly for now.
+	 */
+	if (event == OCFS2_LA_EVENT_ENOSPC ||
+	    event == OCFS2_LA_EVENT_FRAGMENTED) {
+		/*
+		 * We ran out of contiguous space in the primary
+		 * bitmap. Drastically reduce the number of bits used
+		 * by local alloc until we have to disable it.
+		 */
+		bits = osb->local_alloc_bits >> 1;
+		if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
+			/*
+			 * By setting state to THROTTLED, we'll keep
+			 * the number of local alloc bits used down
+			 * until an event occurs which would give us
+			 * reason to assume the bitmap situation might
+			 * have changed.
+			 */
+			osb->local_alloc_state = OCFS2_LA_THROTTLED;
+			osb->local_alloc_bits = bits;
+		} else {
+			osb->local_alloc_state = OCFS2_LA_DISABLED;
+		}
+		queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+				   OCFS2_LA_ENABLE_INTERVAL);
+		goto out_unlock;
+	}
+
+	/*
+	 * Don't increase the size of the local alloc window until we
+	 * know we might be able to fulfill the request. Otherwise, we
+	 * risk bouncing around the global bitmap during periods of
+	 * low space.
+	 */
+	if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
+		osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+out_unlock:
+	state = osb->local_alloc_state;
+	spin_unlock(&osb->osb_lock);
+
+	return state;
+}
+
 static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 						struct ocfs2_alloc_context **ac,
 						struct inode **bitmap_inode,
@@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
+retry_enospc:
+	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
 
 	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+	if (status == -ENOSPC) {
+		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
+		    OCFS2_LA_DISABLED)
+			goto bail;
+
+		ocfs2_free_ac_resource(*ac);
+		memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
+		goto retry_enospc;
+	}
 	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
+		mlog_errno(status);
 		goto bail;
 	}
 
@@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 		     "one\n");
 
 	mlog(0, "Allocating %u clusters for a new window.\n",
-	     ocfs2_local_alloc_window_bits(osb));
+	     osb->local_alloc_bits);
 
 	/* Instruct the allocation code to try the most recently used
 	 * cluster group. We'll re-record the group used this pass
@@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	/* we used the generic suballoc reserve function, but we set
 	 * everything up nicely, so there's no reason why we can't use
 	 * the more specific cluster api to claim bits. */
-	status = ocfs2_claim_clusters(osb, handle, ac,
-				      ocfs2_local_alloc_window_bits(osb),
+	status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
 				      &cluster_off, &cluster_count);
+	if (status == -ENOSPC) {
+retry_enospc:
+		/*
+		 * Note: We could also try syncing the journal here to
+		 * allow use of any free bits which the current
+		 * transaction can't give us access to. --Mark
+		 */
+		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
+		    OCFS2_LA_DISABLED)
+			goto bail;
+
+		status = ocfs2_claim_clusters(osb, handle, ac,
+					      osb->local_alloc_bits,
+					      &cluster_off,
+					      &cluster_count);
+		if (status == -ENOSPC)
+			goto retry_enospc;
+		/*
+		 * We only shrunk the *minimum* number of in our
+		 * request - it's entirely possible that the allocator
+		 * might give us more than we asked for.
+		 */
+		if (status == 0) {
+			spin_lock(&osb->osb_lock);
+			osb->local_alloc_bits = cluster_count;
+			spin_unlock(&osb->osb_lock);
+		}
+	}
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
+	ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
+
 	/* This will lock the main bitmap for us. */
 	status = ocfs2_local_alloc_reserve_for_window(osb,
 						      &ac,
@@ -976,8 +1277,7 @@ bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
-	if (main_bm_bh)
-		brelse(main_bm_bh);
+	brelse(main_bm_bh);
 
 	if (main_bm_inode)
 		iput(main_bm_inode);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 3f76631e110..ac5ea9f8665 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 				 u32 *bit_off,
 				 u32 *num_bits);
 
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+				      unsigned int num_clusters);
+void ocfs2_la_enable_worker(struct work_struct *work);
+
 #endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 203f8714387..544ac624517 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -24,6 +24,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/fcntl.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -32,6 +33,7 @@
 
 #include "dlmglue.h"
 #include "file.h"
+#include "inode.h"
 #include "locks.h"
 
 static int ocfs2_do_flock(struct file *file, struct inode *inode,
@@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 	else
 		return ocfs2_do_flock(file, inode, cmd, fl);
 }
+
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!(fl->fl_flags & FL_POSIX))
+		return -ENOLCK;
+	if (__mandatory_lock(inode))
+		return -ENOLCK;
+
+	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
+}
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 9743ef2324e..496d488b271 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -27,5 +27,6 @@
 #define OCFS2_LOCKS_H
 
 int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
 
 #endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d5d808fe014..485a6aa0ad3 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,6 +60,7 @@
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -327,14 +328,9 @@ leave:
 	if (status == -ENOSPC)
 		mlog(0, "Disk is full\n");
 
-	if (new_fe_bh)
-		brelse(new_fe_bh);
-
-	if (de_bh)
-		brelse(de_bh);
-
-	if (parent_fe_bh)
-		brelse(parent_fe_bh);
+	brelse(new_fe_bh);
+	brelse(de_bh);
+	brelse(parent_fe_bh);
 
 	if ((status < 0) && inode)
 		iput(inode);
@@ -647,12 +643,9 @@ out_unlock_inode:
 out:
 	ocfs2_inode_unlock(dir, 1);
 
-	if (de_bh)
-		brelse(de_bh);
-	if (fe_bh)
-		brelse(fe_bh);
-	if (parent_fe_bh)
-		brelse(parent_fe_bh);
+	brelse(de_bh);
+	brelse(fe_bh);
+	brelse(parent_fe_bh);
 
 	mlog_exit(err);
 
@@ -851,17 +844,10 @@ leave:
 		iput(orphan_dir);
 	}
 
-	if (fe_bh)
-		brelse(fe_bh);
-
-	if (dirent_bh)
-		brelse(dirent_bh);
-
-	if (parent_node_bh)
-		brelse(parent_node_bh);
-
-	if (orphan_entry_bh)
-		brelse(orphan_entry_bh);
+	brelse(fe_bh);
+	brelse(dirent_bh);
+	brelse(parent_node_bh);
+	brelse(orphan_entry_bh);
 
 	mlog_exit(status);
 
@@ -1372,24 +1358,15 @@ bail:
 
 	if (new_inode)
 		iput(new_inode);
-	if (newfe_bh)
-		brelse(newfe_bh);
-	if (old_inode_bh)
-		brelse(old_inode_bh);
-	if (old_dir_bh)
-		brelse(old_dir_bh);
-	if (new_dir_bh)
-		brelse(new_dir_bh);
-	if (new_de_bh)
-		brelse(new_de_bh);
-	if (old_de_bh)
-		brelse(old_de_bh);
-	if (old_inode_de_bh)
-		brelse(old_inode_de_bh);
-	if (orphan_entry_bh)
-		brelse(orphan_entry_bh);
-	if (insert_entry_bh)
-		brelse(insert_entry_bh);
+	brelse(newfe_bh);
+	brelse(old_inode_bh);
+	brelse(old_dir_bh);
+	brelse(new_dir_bh);
+	brelse(new_de_bh);
+	brelse(old_de_bh);
+	brelse(old_inode_de_bh);
+	brelse(orphan_entry_bh);
+	brelse(insert_entry_bh);
 
 	mlog_exit(status);
 
@@ -1492,8 +1469,7 @@ bail:
 
 	if (bhs) {
 		for(i = 0; i < blocks; i++)
-			if (bhs[i])
-				brelse(bhs[i]);
+			brelse(bhs[i]);
 		kfree(bhs);
 	}
 
@@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir,
 		u32 offset = 0;
 
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
-						    new_fe_bh,
-						    handle, data_ac, NULL,
-						    NULL);
+		status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
+					      new_fe_bh,
+					      handle, data_ac, NULL,
+					      NULL);
 		if (status < 0) {
 			if (status != -ENOSPC && status != -EINTR) {
 				mlog(ML_ERROR,
@@ -1659,12 +1635,9 @@ bail:
 
 	ocfs2_inode_unlock(dir, 1);
 
-	if (new_fe_bh)
-		brelse(new_fe_bh);
-	if (parent_fe_bh)
-		brelse(parent_fe_bh);
-	if (de_bh)
-		brelse(de_bh);
+	brelse(new_fe_bh);
+	brelse(parent_fe_bh);
+	brelse(de_bh);
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
@@ -1759,8 +1732,7 @@ leave:
 		iput(orphan_dir_inode);
 	}
 
-	if (orphan_dir_bh)
-		brelse(orphan_dir_bh);
+	brelse(orphan_dir_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
-	status = ocfs2_read_block(osb,
+	status = ocfs2_read_block(orphan_dir_inode,
 				  OCFS2_I(orphan_dir_inode)->ip_blkno,
-				  &orphan_dir_bh, OCFS2_BH_CACHED,
-				  orphan_dir_inode);
+				  &orphan_dir_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
 
 leave:
-	if (orphan_dir_bh)
-		brelse(orphan_dir_bh);
+	brelse(orphan_dir_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	}
 
 leave:
-	if (target_de_bh)
-		brelse(target_de_bh);
+	brelse(target_de_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
 	.permission	= ocfs2_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7f625f2b111..a21a465490c 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,7 +34,12 @@
 #include <linux/workqueue.h>
 #include <linux/kref.h>
 #include <linux/mutex.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
 
 /* For union ocfs2_dlm_lksb */
 #include "stackglue.h"
@@ -171,9 +176,13 @@ struct ocfs2_alloc_stats
 
 enum ocfs2_local_alloc_state
 {
-	OCFS2_LA_UNUSED = 0,
-	OCFS2_LA_ENABLED,
-	OCFS2_LA_DISABLED
+	OCFS2_LA_UNUSED = 0,	/* Local alloc will never be used for
+				 * this mountpoint. */
+	OCFS2_LA_ENABLED,	/* Local alloc is in use. */
+	OCFS2_LA_THROTTLED,	/* Local alloc is in use, but number
+				 * of bits has been reduced. */
+	OCFS2_LA_DISABLED	/* Local alloc has temporarily been
+				 * disabled. */
 };
 
 enum ocfs2_mount_options
@@ -184,6 +193,8 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
 	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
 	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
+	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
+	OCFS2_MOUNT_INODE64 = 1 << 7,	/* Allow inode numbers > 2^32 */
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
@@ -214,6 +225,7 @@ struct ocfs2_super
 	u32 bitmap_cpg;
 	u8 *uuid;
 	char *uuid_str;
+	u32 uuid_hash;
 	u8 *vol_label;
 	u64 first_cluster_group_blkno;
 	u32 fs_generation;
@@ -241,6 +253,7 @@ struct ocfs2_super
 	int s_sectsize_bits;
 	int s_clustersize;
 	int s_clustersize_bits;
+	unsigned int s_xattr_inline_size;
 
 	atomic_t vol_state;
 	struct mutex recovery_lock;
@@ -252,11 +265,27 @@ struct ocfs2_super
 	struct ocfs2_journal *journal;
 	unsigned long osb_commit_interval;
 
-	int local_alloc_size;
-	enum ocfs2_local_alloc_state local_alloc_state;
+	struct delayed_work		la_enable_wq;
+
+	/*
+	 * Must hold local alloc i_mutex and osb->osb_lock to change
+	 * local_alloc_bits. Reads can be done under either lock.
+	 */
+	unsigned int local_alloc_bits;
+	unsigned int local_alloc_default_bits;
+
+	enum ocfs2_local_alloc_state local_alloc_state; /* protected
+							 * by osb_lock */
+
 	struct buffer_head *local_alloc_bh;
+
 	u64 la_last_gd;
 
+#ifdef CONFIG_OCFS2_FS_STATS
+	struct dentry *local_alloc_debug;
+	char *local_alloc_debug_buf;
+#endif
+
 	/* Next two fields are for local node slot recovery during
 	 * mount. */
 	int dirty;
@@ -340,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
@@ -554,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
 	return pages_per_cluster;
 }
 
+static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
+						       unsigned int megs)
+{
+	BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
+
+	return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
 static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
 {
 	spin_lock(&osb->osb_lock);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 4f619850ccf..f24ce3d3f95 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -64,6 +64,7 @@
 #define OCFS2_INODE_SIGNATURE		"INODE01"
 #define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
+#define OCFS2_XATTR_BLOCK_SIGNATURE	"XATTR01"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -90,7 +91,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
 					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
-					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
+					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
+					 | OCFS2_FEATURE_INCOMPAT_XATTR)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
 
 /*
@@ -127,10 +129,6 @@
 /* Support for data packed into inode blocks */
 #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA	0x0040
 
-/* Support for the extended slot map */
-#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
-
-
 /*
  * Support for alternate, userspace cluster stacks.  If set, the superblock
  * field s_cluster_info contains a tag for the alternate stack in use as
@@ -142,6 +140,12 @@
  */
 #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK	0x0080
 
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+/* Support for extended attributes */
+#define OCFS2_FEATURE_INCOMPAT_XATTR		0x0200
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -299,6 +303,12 @@ struct ocfs2_new_group_input {
  */
 #define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE	8
 
+/*
+ * Inline extended attribute size (in bytes)
+ * The value chosen should be aligned to 16 byte boundaries.
+ */
+#define OCFS2_MIN_XATTR_INLINE_SIZE     256
+
 struct ocfs2_system_inode_info {
 	char	*si_name;
 	int	si_iflags;
@@ -563,7 +573,7 @@ struct ocfs2_super_block {
 /*40*/	__le16 s_max_slots;		/* Max number of simultaneous mounts
 					   before tunefs required */
 	__le16 s_tunefs_flag;
-	__le32 s_reserved1;
+	__le32 s_uuid_hash;		/* hash value of uuid */
 	__le64 s_first_cluster_group;	/* Block offset of 1st cluster
 					 * group header */
 /*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
@@ -571,7 +581,11 @@ struct ocfs2_super_block {
 /*A0*/  struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
 						     stack.  Only valid
 						     with INCOMPAT flag. */
-/*B8*/  __le64 s_reserved2[17];		/* Fill out superblock */
+/*B8*/	__le16 s_xattr_inline_size;	/* extended attribute inline size
+					   for this fs*/
+	__le16 s_reserved0;
+	__le32 s_reserved1;
+/*C0*/  __le64 s_reserved2[16];		/* Fill out superblock */
 /*140*/
 
 	/*
@@ -621,7 +635,8 @@ struct ocfs2_dinode {
 					   belongs to */
 	__le16 i_suballoc_bit;		/* Bit offset in suballocator
 					   block group */
-/*10*/	__le32 i_reserved0;
+/*10*/	__le16 i_reserved0;
+	__le16 i_xattr_inline_size;
 	__le32 i_clusters;		/* Cluster count */
 	__le32 i_uid;			/* Owner UID */
 	__le32 i_gid;			/* Owning GID */
@@ -640,11 +655,12 @@ struct ocfs2_dinode {
 	__le32 i_atime_nsec;
 	__le32 i_ctime_nsec;
 	__le32 i_mtime_nsec;
-	__le32 i_attr;
+/*70*/	__le32 i_attr;
 	__le16 i_orphaned_slot;		/* Only valid when OCFS2_ORPHANED_FL
 					   was set in i_flags */
 	__le16 i_dyn_features;
-/*70*/	__le64 i_reserved2[8];
+	__le64 i_xattr_loc;
+/*80*/	__le64 i_reserved2[7];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -715,6 +731,136 @@ struct ocfs2_group_desc
 /*40*/	__u8    bg_bitmap[0];
 };
 
+/*
+ * On disk extended attribute structure for OCFS2.
+ */
+
+/*
+ * ocfs2_xattr_entry indicates one extend attribute.
+ *
+ * Note that it can be stored in inode, one block or one xattr bucket.
+ */
+struct ocfs2_xattr_entry {
+	__le32	xe_name_hash;    /* hash value of xattr prefix+suffix. */
+	__le16	xe_name_offset;  /* byte offset from the 1st etnry in the local
+				    local xattr storage(inode, xattr block or
+				    xattr bucket). */
+	__u8	xe_name_len;	 /* xattr name len, does't include prefix. */
+	__u8	xe_type;         /* the low 7 bits indicates the name prefix's
+				  * type and the highest 1 bits indicate whether
+				  * the EA is stored in the local storage. */
+	__le64	xe_value_size;	 /* real xattr value length. */
+};
+
+/*
+ * On disk structure for xattr header.
+ *
+ * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in
+ * the local xattr storage.
+ */
+struct ocfs2_xattr_header {
+	__le16	xh_count;                       /* contains the count of how
+						   many records are in the
+						   local xattr storage. */
+	__le16	xh_free_start;                  /* current offset for storing
+						   xattr. */
+	__le16	xh_name_value_len;              /* total length of name/value
+						   length in this bucket. */
+	__le16	xh_num_buckets;                 /* bucket nums in one extent
+						   record, only valid in the
+						   first bucket. */
+	__le64  xh_csum;
+	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
+};
+
+/*
+ * On disk structure for xattr value root.
+ *
+ * It is used when one extended attribute's size is larger, and we will save it
+ * in an outside cluster. It will stored in a b-tree like file content.
+ */
+struct ocfs2_xattr_value_root {
+/*00*/	__le32	xr_clusters;              /* clusters covered by xattr value. */
+	__le32	xr_reserved0;
+	__le64	xr_last_eb_blk;           /* Pointer to last extent block */
+/*10*/	struct ocfs2_extent_list xr_list; /* Extent record list */
+};
+
+/*
+ * On disk structure for xattr tree root.
+ *
+ * It is used when there are too many extended attributes for one file. These
+ * attributes will be organized and stored in an indexed-btree.
+ */
+struct ocfs2_xattr_tree_root {
+/*00*/	__le32	xt_clusters;              /* clusters covered by xattr. */
+	__le32	xt_reserved0;
+	__le64	xt_last_eb_blk;           /* Pointer to last extent block */
+/*10*/	struct ocfs2_extent_list xt_list; /* Extent record list */
+};
+
+#define OCFS2_XATTR_INDEXED	0x1
+#define OCFS2_HASH_SHIFT	5
+#define OCFS2_XATTR_ROUND	3
+#define OCFS2_XATTR_SIZE(size)	(((size) + OCFS2_XATTR_ROUND) & \
+				~(OCFS2_XATTR_ROUND))
+
+#define OCFS2_XATTR_BUCKET_SIZE			4096
+#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET 	(OCFS2_XATTR_BUCKET_SIZE \
+						 / OCFS2_MIN_BLOCKSIZE)
+
+/*
+ * On disk structure for xattr block.
+ */
+struct ocfs2_xattr_block {
+/*00*/	__u8	xb_signature[8];     /* Signature for verification */
+	__le16	xb_suballoc_slot;    /* Slot suballocator this
+					block belongs to. */
+	__le16	xb_suballoc_bit;     /* Bit offset in suballocator
+					block group */
+	__le32	xb_fs_generation;    /* Must match super block */
+/*10*/	__le64	xb_blkno;            /* Offset on disk, in blocks */
+	__le64	xb_csum;
+/*20*/	__le16	xb_flags;            /* Indicates whether this block contains
+					real xattr or a xattr tree. */
+	__le16	xb_reserved0;
+	__le32  xb_reserved1;
+	__le64	xb_reserved2;
+/*30*/	union {
+		struct ocfs2_xattr_header xb_header; /* xattr header if this
+							block contains xattr */
+		struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
+							block cotains xattr
+							tree. */
+	} xb_attrs;
+};
+
+#define OCFS2_XATTR_ENTRY_LOCAL		0x80
+#define OCFS2_XATTR_TYPE_MASK		0x7F
+static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
+					 int local)
+{
+	if (local)
+		xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
+	else
+		xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
+{
+	return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
+{
+	xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
+}
+
+static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
+{
+	return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
+}
+
 #ifdef __KERNEL__
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 {
@@ -728,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb)
 		offsetof(struct ocfs2_dinode, id2.i_data.id_data);
 }
 
+static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
+						   struct ocfs2_dinode *di)
+{
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+			xattrsize;
+	else
+		return sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
 static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 {
 	int size;
@@ -738,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 	return size / sizeof(struct ocfs2_extent_rec);
 }
 
+static inline int ocfs2_extent_recs_per_inode_with_xattr(
+						struct super_block *sb,
+						struct ocfs2_dinode *di)
+{
+	int size;
+	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
+			xattrsize;
+	else
+		size = sb->s_blocksize -
+			offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
+
 static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
 {
 	int size;
@@ -801,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
 	return 0;
 
 }
+
+static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
+{
+	int size;
+
+	size = sb->s_blocksize -
+		offsetof(struct ocfs2_xattr_block,
+			 xb_attrs.xb_root.xt_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
 #else
 static inline int ocfs2_fast_symlink_chars(int blocksize)
 {
@@ -884,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
 
 	return 0;
 }
+
+static inline int ocfs2_xattr_recs_per_xb(int blocksize)
+{
+	int size;
+
+	size = blocksize -
+		offsetof(struct ocfs2_xattr_block,
+			 xb_attrs.xb_root.xt_list.l_recs);
+
+	return size / sizeof(struct ocfs2_extent_rec);
+}
 #endif  /* __KERNEL__ */
 
 
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
new file mode 100644
index 00000000000..b91c78f8f55
--- /dev/null
+++ b/fs/ocfs2/ocfs2_jbd_compat.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_jbd_compat.h
+ *
+ * Compatibility defines for JBD.
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_JBD_COMPAT_H
+#define OCFS2_JBD_COMPAT_H
+
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# error Should not have been included
+#endif
+
+struct jbd2_inode {
+	unsigned int dummy;
+};
+
+#define JBD2_BARRIER			JFS_BARRIER
+#define JBD2_DEFAULT_MAX_COMMIT_AGE	JBD_DEFAULT_MAX_COMMIT_AGE
+
+#define jbd2_journal_ack_err			journal_ack_err
+#define jbd2_journal_clear_err			journal_clear_err
+#define jbd2_journal_destroy			journal_destroy
+#define jbd2_journal_dirty_metadata		journal_dirty_metadata
+#define jbd2_journal_errno			journal_errno
+#define jbd2_journal_extend			journal_extend
+#define jbd2_journal_flush			journal_flush
+#define jbd2_journal_force_commit		journal_force_commit
+#define jbd2_journal_get_write_access		journal_get_write_access
+#define jbd2_journal_get_undo_access		journal_get_undo_access
+#define jbd2_journal_init_inode			journal_init_inode
+#define jbd2_journal_invalidatepage		journal_invalidatepage
+#define jbd2_journal_load			journal_load
+#define jbd2_journal_lock_updates		journal_lock_updates
+#define jbd2_journal_restart			journal_restart
+#define jbd2_journal_start			journal_start
+#define jbd2_journal_start_commit		journal_start_commit
+#define jbd2_journal_stop			journal_stop
+#define jbd2_journal_try_to_free_buffers	journal_try_to_free_buffers
+#define jbd2_journal_unlock_updates		journal_unlock_updates
+#define jbd2_journal_wipe			journal_wipe
+#define jbd2_log_wait_commit			log_wait_commit
+
+static inline int jbd2_journal_file_inode(handle_t *handle,
+					  struct jbd2_inode *inode)
+{
+	return 0;
+}
+
+static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+						      loff_t new_size)
+{
+	return 0;
+}
+
+static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
+					       struct inode *inode)
+{
+	return;
+}
+
+static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
+						  struct jbd2_inode *jinode)
+{
+	return;
+}
+
+
+#endif  /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 8166968e901..ffd48db229a 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
 		if (cluster > clusters)
 			break;
 
-		ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
+		ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
 		if (ret < 0) {
 			mlog_errno(ret);
 			break;
@@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
 	 * update the superblock last.
 	 * It doesn't matter if the write failed.
 	 */
-	ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
-			       &super_bh, 0, NULL);
+	ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
+				     &super_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
 
-	ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
-			       main_bm_inode);
+	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
@@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_unlock;
 	}
 
-	ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
+	ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
 	if (ret < 0) {
 		mlog(ML_ERROR, "Can't read the group descriptor # %llu "
 		     "from the device.", (unsigned long long)input->group);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bb5ff8939bf..bdda2d8f850 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * be !NULL.  Thus, ocfs2_read_blocks() will ignore blocknr.  If
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
-	ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
-				si->si_inode);
+	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
+				OCFS2_BH_IGNORE_CACHE);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 		     (unsigned long long)blkno);
 
 		bh = NULL;  /* Acquire a fresh bh */
-		status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
+					   OCFS2_BH_IGNORE_CACHE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 353fc35c674..faec2d87935 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -28,6 +28,7 @@
 #include "ocfs2.h"  /* For struct ocfs2_lock_res */
 #include "stackglue.h"
 
+#include <linux/dlm_plock.h>
 
 /*
  * The control protocol starts with a handshake.  Until the handshake
@@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 {
 }
 
+static int user_plock(struct ocfs2_cluster_connection *conn,
+		      u64 ino,
+		      struct file *file,
+		      int cmd,
+		      struct file_lock *fl)
+{
+	/*
+	 * This more or less just demuxes the plock request into any
+	 * one of three dlm calls.
+	 *
+	 * Internally, fs/dlm will pass these to a misc device, which
+	 * a userspace daemon will read and write to.
+	 *
+	 * For now, cancel requests (which happen internally only),
+	 * are turned into unlocks. Most of this function taken from
+	 * gfs2_lock.
+	 */
+
+	if (cmd == F_CANCELLK) {
+		cmd = F_SETLK;
+		fl->fl_type = F_UNLCK;
+	}
+
+	if (IS_GETLK(cmd))
+		return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
+	else if (fl->fl_type == F_UNLCK)
+		return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
+	else
+		return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
+}
+
 /*
  * Compare a requested locking protocol version against the current one.
  *
@@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
 	.dlm_unlock	= user_dlm_unlock,
 	.lock_status	= user_dlm_lock_status,
 	.lock_lvb	= user_dlm_lvb,
+	.plock		= user_plock,
 	.dump_lksb	= user_dlm_dump_lksb,
 };
 
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 07f348b8d72..68b668b0e60 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -288,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
 }
 EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
 
+int ocfs2_stack_supports_plocks(void)
+{
+	return active_stack && active_stack->sp_ops->plock;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
+
+/*
+ * ocfs2_plock() can only be safely called if
+ * ocfs2_stack_supports_plocks() returned true
+ */
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+		struct file *file, int cmd, struct file_lock *fl)
+{
+	WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
+	if (active_stack->sp_ops->plock)
+		return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(ocfs2_plock);
+
 int ocfs2_cluster_connect(const char *stack_name,
 			  const char *group,
 			  int grouplen,
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index db56281dd1b..c571af375ef 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -28,6 +28,10 @@
 #include "dlm/dlmapi.h"
 #include <linux/dlm.h>
 
+/* Needed for plock-related prototypes */
+struct file;
+struct file_lock;
+
 /*
  * dlmconstants.h does not have a LOCAL flag.  We hope to remove it
  * some day, but right now we need it.  Let's fake it.  This value is larger
@@ -187,6 +191,17 @@ struct ocfs2_stack_operations {
 	void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
 
 	/*
+	 * Cluster-aware posix locks
+	 *
+	 * This is NULL for stacks which do not support posix locks.
+	 */
+	int (*plock)(struct ocfs2_cluster_connection *conn,
+		     u64 ino,
+		     struct file *file,
+		     int cmd,
+		     struct file_lock *fl);
+
+	/*
 	 * This is an optoinal debugging hook.  If provided, the
 	 * stack can dump debugging information about this lock.
 	 */
@@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
 void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
 void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
 
+int ocfs2_stack_supports_plocks(void);
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+		struct file *file, int cmd, struct file_lock *fl);
+
 void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
 
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d2d278fb981..c5ff18b46b5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle,
 				  struct ocfs2_chain_list *cl);
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 				   struct inode *alloc_inode,
-				   struct buffer_head *bh);
+				   struct buffer_head *bh,
+				   u64 max_block);
 
 static int ocfs2_cluster_group_search(struct inode *inode,
 				      struct buffer_head *group_bh,
 				      u32 bits_wanted, u32 min_bits,
+				      u64 max_block,
 				      u16 *bit_off, u16 *bits_found);
 static int ocfs2_block_group_search(struct inode *inode,
 				    struct buffer_head *group_bh,
 				    u32 bits_wanted, u32 min_bits,
+				    u64 max_block,
 				    u16 *bit_off, u16 *bits_found);
 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 				     struct ocfs2_alloc_context *ac,
@@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
 						u64 data_blkno,
 						u64 *bg_blkno,
 						u16 *bg_bit_off);
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+					     u32 bits_wanted, u64 max_block,
+					     struct ocfs2_alloc_context **ac);
 
-static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 {
 	struct inode *inode = ac->ac_inode;
 
@@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 		iput(inode);
 		ac->ac_inode = NULL;
 	}
-	if (ac->ac_bh) {
-		brelse(ac->ac_bh);
-		ac->ac_bh = NULL;
-	}
+	brelse(ac->ac_bh);
+	ac->ac_bh = NULL;
 }
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
  */
 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 				   struct inode *alloc_inode,
-				   struct buffer_head *bh)
+				   struct buffer_head *bh,
+				   u64 max_block)
 {
 	int status, credits;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 	mlog_entry_void();
 
 	cl = &fe->id2.i_chain;
-	status = ocfs2_reserve_clusters(osb,
-					le16_to_cpu(cl->cl_cpg),
-					&ac);
+	status = ocfs2_reserve_clusters_with_limit(osb,
+						   le16_to_cpu(cl->cl_cpg),
+						   max_block, &ac);
 	if (status < 0) {
 		if (status != -ENOSPC)
 			mlog_errno(status);
@@ -394,8 +399,7 @@ bail:
 	if (ac)
 		ocfs2_free_alloc_context(ac);
 
-	if (bg_bh)
-		brelse(bg_bh);
+	brelse(bg_bh);
 
 	mlog_exit(status);
 	return status;
@@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
+						 ac->ac_max_block);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
@@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	get_bh(bh);
 	ac->ac_bh = bh;
 bail:
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	mlog_exit(status);
 	return status;
 }
 
-int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
-			       struct ocfs2_dinode *fe,
-			       struct ocfs2_alloc_context **ac)
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+				      int blocks,
+				      struct ocfs2_alloc_context **ac)
 {
 	int status;
 	u32 slot;
@@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+	(*ac)->ac_bits_wanted = blocks;
 	(*ac)->ac_which = OCFS2_AC_USE_META;
 	slot = osb->slot_num;
 	(*ac)->ac_group_search = ocfs2_block_group_search;
@@ -532,6 +536,15 @@ bail:
 	return status;
 }
 
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+			       struct ocfs2_extent_list *root_el,
+			       struct ocfs2_alloc_context **ac)
+{
+	return ocfs2_reserve_new_metadata_blocks(osb,
+					ocfs2_extend_meta_needed(root_el),
+					ac);
+}
+
 static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
 					      struct ocfs2_alloc_context *ac)
 {
@@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 	(*ac)->ac_group_search = ocfs2_block_group_search;
 
 	/*
+	 * stat(2) can't handle i_ino > 32bits, so we tell the
+	 * lower levels not to allocate us a block group past that
+	 * limit.  The 'inode64' mount option avoids this behavior.
+	 */
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
+		(*ac)->ac_max_block = (u32)~0U;
+
+	/*
 	 * slot is set when we successfully steal inode from other nodes.
 	 * It is reset in 3 places:
 	 * 1. when we flush the truncate log
@@ -661,9 +682,9 @@ bail:
 /* Callers don't need to care which bitmap (local alloc or main) to
  * use so we figure it out for them, but unfortunately this clutters
  * things a bit. */
-int ocfs2_reserve_clusters(struct ocfs2_super *osb,
-			   u32 bits_wanted,
-			   struct ocfs2_alloc_context **ac)
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+					     u32 bits_wanted, u64 max_block,
+					     struct ocfs2_alloc_context **ac)
 {
 	int status;
 
@@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
 	}
 
 	(*ac)->ac_bits_wanted = bits_wanted;
+	(*ac)->ac_max_block = max_block;
 
 	status = -ENOSPC;
 	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
 		status = ocfs2_reserve_local_alloc_bits(osb,
 							bits_wanted,
 							*ac);
-		if ((status < 0) && (status != -ENOSPC)) {
+		if (status == -EFBIG) {
+			/* The local alloc window is outside ac_max_block.
+			 * use the main bitmap. */
+			status = -ENOSPC;
+		} else if ((status < 0) && (status != -ENOSPC)) {
 			mlog_errno(status);
 			goto bail;
-		} else if (status == -ENOSPC) {
-			/* reserve_local_bits will return enospc with
-			 * the local alloc inode still locked, so we
-			 * can change this safely here. */
-			mlog(0, "Disabling local alloc\n");
-			/* We set to OCFS2_LA_DISABLED so that umount
-			 * can clean up what's left of the local
-			 * allocation */
-			osb->local_alloc_state = OCFS2_LA_DISABLED;
 		}
 	}
 
@@ -718,6 +735,13 @@ bail:
 	return status;
 }
 
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+			   u32 bits_wanted,
+			   struct ocfs2_alloc_context **ac)
+{
+	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
+}
+
 /*
  * More or less lifted from ext3. I'll leave their description below:
  *
@@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
 static int ocfs2_cluster_group_search(struct inode *inode,
 				      struct buffer_head *group_bh,
 				      u32 bits_wanted, u32 min_bits,
+				      u64 max_block,
 				      u16 *bit_off, u16 *bits_found)
 {
 	int search = -ENOSPC;
 	int ret;
+	u64 blkoff;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u16 tmp_off, tmp_found;
 	unsigned int max_bits, gd_cluster_off;
 
@@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 		if (ret)
 			return ret;
 
+		if (max_block) {
+			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
+							  gd_cluster_off +
+							  tmp_off + tmp_found);
+			mlog(0, "Checking %llu against %llu\n",
+			     (unsigned long long)blkoff,
+			     (unsigned long long)max_block);
+			if (blkoff > max_block)
+				return -ENOSPC;
+		}
+
 		/* ocfs2_block_group_find_clear_bits() might
 		 * return success, but we still want to return
 		 * -ENOSPC unless it found the minimum number
@@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 			*bit_off = tmp_off;
 			*bits_found = tmp_found;
 			search = 0; /* success */
+		} else if (tmp_found) {
+			/*
+			 * Don't show bits which we'll be returning
+			 * for allocation to the local alloc bitmap.
+			 */
+			ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
 		}
 	}
 
@@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 static int ocfs2_block_group_search(struct inode *inode,
 				    struct buffer_head *group_bh,
 				    u32 bits_wanted, u32 min_bits,
+				    u64 max_block,
 				    u16 *bit_off, u16 *bits_found)
 {
 	int ret = -ENOSPC;
+	u64 blkoff;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
 
 	BUG_ON(min_bits != 1);
 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
 
-	if (bg->bg_free_bits_count)
+	if (bg->bg_free_bits_count) {
 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
 							group_bh, bits_wanted,
 							le16_to_cpu(bg->bg_bits),
 							bit_off, bits_found);
+		if (!ret && max_block) {
+			blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
+				*bits_found;
+			mlog(0, "Checking %llu against %llu\n",
+			     (unsigned long long)blkoff,
+			     (unsigned long long)max_block);
+			if (blkoff > max_block)
+				ret = -ENOSPC;
+		}
+	}
 
 	return ret;
 }
@@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	struct ocfs2_group_desc *gd;
 	struct inode *alloc_inode = ac->ac_inode;
 
-	ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
-			       &group_bh, OCFS2_BH_CACHED, alloc_inode);
+	ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	}
 
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
-				  bit_off, &found);
+				  ac->ac_max_block, bit_off, &found);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
 			mlog_errno(ret);
@@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	     bits_wanted, chain,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
 
-	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+	status = ocfs2_read_block(alloc_inode,
 				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
-				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
+				  &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	/* for now, the chain search is a bit simplistic. We just use
 	 * the 1st group with any empty bits. */
 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
-					     bits_wanted, min_bits, bit_off,
+					     bits_wanted, min_bits,
+					     ac->ac_max_block, bit_off,
 					     &tmp_bits)) == -ENOSPC) {
 		if (!bg->bg_next_group)
 			break;
 
-		if (prev_group_bh) {
-			brelse(prev_group_bh);
-			prev_group_bh = NULL;
-		}
+		brelse(prev_group_bh);
+		prev_group_bh = NULL;
+
 		next_group = le64_to_cpu(bg->bg_next_group);
 		prev_group_bh = group_bh;
 		group_bh = NULL;
-		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
-					  next_group, &group_bh,
-					  OCFS2_BH_CACHED, alloc_inode);
+		status = ocfs2_read_block(alloc_inode,
+					  next_group, &group_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
 bail:
-	if (group_bh)
-		brelse(group_bh);
-	if (prev_group_bh)
-		brelse(prev_group_bh);
+	brelse(group_bh);
+	brelse(prev_group_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 {
 	int status = 0;
 	u32 tmp_used;
-	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
 	struct buffer_head *group_bh = NULL;
@@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)bg_blkno, start_bit);
 
-	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
-				  alloc_inode);
+	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 	}
 
 bail:
-	if (group_bh)
-		brelse(group_bh);
+	brelse(group_bh);
 
 	mlog_exit(status);
 	return status;
@@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle,
 	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
 					  bg_start_bit, bg_blkno,
 					  num_clusters);
-	if (status < 0)
+	if (status < 0) {
 		mlog_errno(status);
+		goto out;
+	}
 
+	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
+					 num_clusters);
+
+out:
 	mlog_exit(status);
 	return status;
 }
@@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
 		       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
 	}
 }
+
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
+ */
+int ocfs2_lock_allocators(struct inode *inode,
+			  struct ocfs2_extent_tree *et,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac)
+{
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	*meta_ac = NULL;
+	if (data_ac)
+		*data_ac = NULL;
+
+	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
+
+	num_free_extents = ocfs2_num_free_extents(osb, inode, et);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Sparse allocation file systems need to be more conservative
+	 * with reserving room for expansion - the actual allocation
+	 * happens while we've got a journal handle open so re-taking
+	 * a cluster lock (because we ran out of room for another
+	 * extent) will violate ordering rules.
+	 *
+	 * Most of the time we'll only be seeing this 1 cluster at a time
+	 * anyway.
+	 *
+	 * Always lock for any unwritten extents - we might want to
+	 * add blocks during a split.
+	 */
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
+		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (clusters_to_add == 0)
+		goto out;
+
+	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+
+		/*
+		 * We cannot have an error and a non null *data_ac.
+		 */
+	}
+
+	return ret;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 544c600662b..4df159d8f45 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -28,10 +28,11 @@
 
 typedef int (group_search_t)(struct inode *,
 			     struct buffer_head *,
-			     u32,
-			     u32,
-			     u16 *,
-			     u16 *);
+			     u32,			/* bits_wanted */
+			     u32,			/* min_bits */
+			     u64,			/* max_block */
+			     u16 *,			/* *bit_off */
+			     u16 *);			/* *bits_found */
 
 struct ocfs2_alloc_context {
 	struct inode *ac_inode;    /* which bitmap are we allocating from? */
@@ -51,6 +52,8 @@ struct ocfs2_alloc_context {
 	group_search_t *ac_group_search;
 
 	u64    ac_last_group;
+	u64    ac_max_block;  /* Highest block number to allocate. 0 is
+				 is the same as ~0 - unlimited */
 };
 
 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
@@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
 	return ac->ac_bits_wanted - ac->ac_bits_given;
 }
 
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
-			       struct ocfs2_dinode *fe,
+			       struct ocfs2_extent_list *root_el,
 			       struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+				      int blocks,
+				      struct ocfs2_alloc_context **ac);
 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 			    struct ocfs2_alloc_context **ac);
 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
@@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
  * apis above. */
 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
 				      struct ocfs2_alloc_context *ac);
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
 
 /* given a cluster offset, calculate which block group it belongs to
  * and return that block offset. */
@@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct ocfs2_dinode *di,
 				 struct ocfs2_group_desc *gd);
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
+			  u32 clusters_to_add, u32 extents_to_split,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac);
 #endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 70334d85aff..304b63ac78c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -64,6 +64,7 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "ver.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -154,6 +155,9 @@ enum {
 	Opt_localalloc,
 	Opt_localflocks,
 	Opt_stack,
+	Opt_user_xattr,
+	Opt_nouser_xattr,
+	Opt_inode64,
 	Opt_err,
 };
 
@@ -173,6 +177,9 @@ static const match_table_t tokens = {
 	{Opt_localalloc, "localalloc=%d"},
 	{Opt_localflocks, "localflocks"},
 	{Opt_stack, "cluster_stack=%s"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_inode64, "inode64"},
 	{Opt_err, NULL}
 };
 
@@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
 		ocfs2_schedule_truncate_log_flush(osb, 0);
 	}
 
-	if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
+	if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+				      &target)) {
 		if (wait)
-			log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
-					target);
+			jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+					     target);
 	}
 	return 0;
 }
@@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
 	if (!oi)
 		return NULL;
 
+	jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
 	return &oi->vfs_inode;
 }
 
@@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		goto out;
 	}
 
+	/* Probably don't want this on remount; it might
+	 * mess with other nodes */
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
+	    (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
+		ret = -EINVAL;
+		mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
+		goto out;
+	}
+
 	/* We're going to/from readonly mode. */
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
 		/* Lock here so the check of HARD_RO and the potential
@@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
 	osb->osb_commit_interval = parsed_options.commit_interval;
-	osb->local_alloc_size = parsed_options.localalloc_opt;
+	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
+	osb->local_alloc_bits = osb->local_alloc_default_bits;
 
 	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
 	if (status)
@@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	return status;
 
 read_super_error:
-	if (bh != NULL)
-		brelse(bh);
+	brelse(bh);
 
 	if (inode)
 		iput(inode);
@@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_data_writeback:
 			mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
 			break;
+		case Opt_user_xattr:
+			mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
+			break;
+		case Opt_nouser_xattr:
+			mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
+			break;
 		case Opt_atime_quantum:
 			if (match_int(&args[0], &option)) {
 				status = 0;
@@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 			if (option < 0)
 				return 0;
 			if (option == 0)
-				option = JBD_DEFAULT_MAX_COMMIT_AGE;
+				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
 			mopt->commit_interval = HZ * option;
 			break;
 		case Opt_localalloc:
@@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb,
 			       OCFS2_STACK_LABEL_LEN);
 			mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
 			break;
+		case Opt_inode64:
+			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
+			break;
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 {
 	struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
 	unsigned long opts = osb->s_mount_opt;
+	unsigned int local_alloc_megs;
 
 	if (opts & OCFS2_MOUNT_HB_LOCAL)
 		seq_printf(s, ",_netdev,heartbeat=local");
@@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",commit=%u",
 			   (unsigned) (osb->osb_commit_interval / HZ));
 
-	if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
-		seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+	local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
+	if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+		seq_printf(s, ",localalloc=%d", local_alloc_megs);
 
 	if (opts & OCFS2_MOUNT_LOCALFLOCKS)
 		seq_printf(s, ",localflocks,");
@@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
 			   osb->osb_cluster_stack);
 
+	if (opts & OCFS2_MOUNT_NOUSERXATTR)
+		seq_printf(s, ",nouser_xattr");
+	else
+		seq_printf(s, ",user_xattr");
+
+	if (opts & OCFS2_MOUNT_INODE64)
+		seq_printf(s, ",inode64");
+
 	return 0;
 }
 
@@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(void *data)
 	oi->ip_dir_start_lookup = 0;
 
 	init_rwsem(&oi->ip_alloc_sem);
+	init_rwsem(&oi->ip_xattr_sem);
 	mutex_init(&oi->ip_io_mutex);
 
 	oi->ip_blkno = 0ULL;
@@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
 	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_xattr = ocfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_flags |= MS_NOATIME;
 	/* this is needed to support O_LARGEFILE */
@@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
 
 	osb->slot_num = OCFS2_INVALID_SLOT;
 
+	osb->s_xattr_inline_size = le16_to_cpu(
+					di->id2.i_super.s_xattr_inline_size);
+
 	osb->local_alloc_state = OCFS2_LA_UNUSED;
 	osb->local_alloc_bh = NULL;
+	INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
 
 	init_waitqueue_head(&osb->osb_mount_event);
 
@@ -1568,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->first_cluster_group_blkno =
 		le64_to_cpu(di->id2.i_super.s_first_cluster_group);
 	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
+	osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
 	mlog(0, "vol_label: %s\n", osb->vol_label);
 	mlog(0, "uuid: %s\n", osb->uuid_str);
 	mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ba9dbb51d25..cbd03dfdc7b 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -50,6 +50,7 @@
 #include "inode.h"
 #include "journal.h"
 #include "symlink.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 
 	mlog_entry_void();
 
-	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				  OCFS2_I(inode)->ip_blkno,
-				  bh,
-				  OCFS2_BH_CACHED,
-				  inode);
+	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
 	if (status < 0) {
 		mlog_errno(status);
 		link = ERR_PTR(status);
@@ -157,8 +154,7 @@ bail:
 		kunmap(page);
 		page_cache_release(page);
 	}
-	if (bh)
-		brelse(bh);
+	brelse(bh);
 
 	return ERR_PTR(status);
 }
@@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
 	.follow_link	= ocfs2_follow_link,
 	.getattr	= ocfs2_getattr,
 	.setattr	= ocfs2_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
 const struct inode_operations ocfs2_fast_symlink_inode_operations = {
 	.readlink	= ocfs2_readlink,
 	.follow_link	= ocfs2_follow_link,
 	.getattr	= ocfs2_getattr,
 	.setattr	= ocfs2_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ocfs2_listxattr,
+	.removexattr	= generic_removexattr,
 };
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 4da8851f2b2..187b99ff036 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,7 +53,11 @@
 #include <linux/highmem.h>
 #include <linux/buffer_head.h>
 #include <linux/rbtree.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+#endif
 
 #define MLOG_MASK_PREFIX ML_UPTODATE
 
@@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
 	ci->ci_num_cached--;
 }
 
-/* Called when we remove a chunk of metadata from an inode. We don't
- * bother reverting things to an inlined array in the case of a remove
- * which moves us back under the limit. */
-void ocfs2_remove_from_cache(struct inode *inode,
-			     struct buffer_head *bh)
+static void ocfs2_remove_block_from_cache(struct inode *inode,
+					  sector_t block)
 {
 	int index;
-	sector_t block = bh->b_blocknr;
 	struct ocfs2_meta_cache_item *item = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
@@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode,
 		kmem_cache_free(ocfs2_uptodate_cachep, item);
 }
 
+/*
+ * Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit.
+ */
+void ocfs2_remove_from_cache(struct inode *inode,
+			     struct buffer_head *bh)
+{
+	sector_t block = bh->b_blocknr;
+
+	ocfs2_remove_block_from_cache(inode, block);
+}
+
+/* Called when we remove xattr clusters from an inode. */
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+					    sector_t block,
+					    u32 c_len)
+{
+	unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
+
+	for (i = 0; i < b_len; i++, block++)
+		ocfs2_remove_block_from_cache(inode, block);
+}
+
 int __init init_ocfs2_uptodate_cache(void)
 {
 	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 2e73206059a..531b4b3a0c4 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
 				   struct buffer_head *bh);
 void ocfs2_remove_from_cache(struct inode *inode,
 			     struct buffer_head *bh);
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+					    sector_t block,
+					    u32 c_len);
 int ocfs2_buffer_read_ahead(struct inode *inode,
 			    struct buffer_head *bh);
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 00000000000..c25780a70df
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,4834 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.c
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is taken from ext3.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/sort.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_XATTR
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "super.h"
+#include "xattr.h"
+
+
+struct ocfs2_xattr_def_value_root {
+	struct ocfs2_xattr_value_root	xv;
+	struct ocfs2_extent_rec		er;
+};
+
+struct ocfs2_xattr_bucket {
+	struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+	struct ocfs2_xattr_header *xh;
+};
+
+#define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
+#define OCFS2_XATTR_INLINE_SIZE	80
+
+static struct ocfs2_xattr_def_value_root def_xv = {
+	.xv.xr_list.l_count = cpu_to_le16(1),
+};
+
+struct xattr_handler *ocfs2_xattr_handlers[] = {
+	&ocfs2_xattr_user_handler,
+	&ocfs2_xattr_trusted_handler,
+	NULL
+};
+
+static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
+	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
+};
+
+struct ocfs2_xattr_info {
+	int name_index;
+	const char *name;
+	const void *value;
+	size_t value_len;
+};
+
+struct ocfs2_xattr_search {
+	struct buffer_head *inode_bh;
+	/*
+	 * xattr_bh point to the block buffer head which has extended attribute
+	 * when extended attribute in inode, xattr_bh is equal to inode_bh.
+	 */
+	struct buffer_head *xattr_bh;
+	struct ocfs2_xattr_header *header;
+	struct ocfs2_xattr_bucket bucket;
+	void *base;
+	void *end;
+	struct ocfs2_xattr_entry *here;
+	int not_found;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+					     struct ocfs2_xattr_header *xh,
+					     int index,
+					     int *block_off,
+					     int *new_offset);
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					struct ocfs2_xattr_tree_root *xt,
+					char *buffer,
+					size_t buffer_size);
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs);
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+					  struct buffer_head *xb_bh);
+
+static inline const char *ocfs2_xattr_prefix(int name_index)
+{
+	struct xattr_handler *handler = NULL;
+
+	if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
+		handler = ocfs2_xattr_handler_map[name_index];
+
+	return handler ? handler->prefix : NULL;
+}
+
+static u32 ocfs2_xattr_name_hash(struct inode *inode,
+				 const char *name,
+				 int name_len)
+{
+	/* Get hash value of uuid from super block */
+	u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
+	int i;
+
+	/* hash extended attribute name */
+	for (i = 0; i < name_len; i++) {
+		hash = (hash << OCFS2_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
+		       *name++;
+	}
+
+	return hash;
+}
+
+/*
+ * ocfs2_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static void ocfs2_xattr_hash_entry(struct inode *inode,
+				   struct ocfs2_xattr_header *header,
+				   struct ocfs2_xattr_entry *entry)
+{
+	u32 hash = 0;
+	char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
+
+	hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
+	entry->xe_name_hash = cpu_to_le32(hash);
+
+	return;
+}
+
+static int ocfs2_xattr_extend_allocation(struct inode *inode,
+					 u32 clusters_to_add,
+					 struct buffer_head *xattr_bh,
+					 struct ocfs2_xattr_value_root *xv)
+{
+	int status = 0;
+	int restart_func = 0;
+	int credits = 0;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	enum ocfs2_alloc_restarted why;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_extent_tree et;
+
+	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
+
+	ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
+
+restart_all:
+
+	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				       &data_ac, &meta_ac);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+					    clusters_to_add);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(status);
+		goto leave;
+	}
+
+restarted_transaction:
+	status = ocfs2_journal_access(handle, inode, xattr_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	prev_clusters = le32_to_cpu(xv->xr_clusters);
+	status = ocfs2_add_clusters_in_btree(osb,
+					     inode,
+					     &logical_start,
+					     clusters_to_add,
+					     0,
+					     &et,
+					     handle,
+					     data_ac,
+					     meta_ac,
+					     &why);
+	if ((status < 0) && (status != -EAGAIN)) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	status = ocfs2_journal_dirty(handle, xattr_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
+	clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+
+	if (why != RESTART_NONE && clusters_to_add) {
+		if (why == RESTART_META) {
+			mlog(0, "restarting function.\n");
+			restart_func = 1;
+		} else {
+			BUG_ON(why != RESTART_TRANS);
+
+			mlog(0, "restarting transaction.\n");
+			/* TODO: This can be more intelligent. */
+			credits = ocfs2_calc_extend_credits(osb->sb,
+							    et.et_root_el,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				/* handle still has to be committed at
+				 * this point. */
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto leave;
+			}
+			goto restarted_transaction;
+		}
+	}
+
+leave:
+	if (handle) {
+		ocfs2_commit_trans(osb, handle);
+		handle = NULL;
+	}
+	if (data_ac) {
+		ocfs2_free_alloc_context(data_ac);
+		data_ac = NULL;
+	}
+	if (meta_ac) {
+		ocfs2_free_alloc_context(meta_ac);
+		meta_ac = NULL;
+	}
+	if ((!status) && restart_func) {
+		restart_func = 0;
+		goto restart_all;
+	}
+
+	return status;
+}
+
+static int __ocfs2_remove_xattr_range(struct inode *inode,
+				      struct buffer_head *root_bh,
+				      struct ocfs2_xattr_value_root *xv,
+				      u32 cpos, u32 phys_cpos, u32 len,
+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+				  dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	le32_add_cpu(&xv->xr_clusters, -len);
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+static int ocfs2_xattr_shrink_size(struct inode *inode,
+				   u32 old_clusters,
+				   u32 new_clusters,
+				   struct buffer_head *root_bh,
+				   struct ocfs2_xattr_value_root *xv)
+{
+	int ret = 0;
+	u32 trunc_len, cpos, phys_cpos, alloc_size;
+	u64 block;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (old_clusters <= new_clusters)
+		return 0;
+
+	cpos = new_clusters;
+	trunc_len = old_clusters - new_clusters;
+	while (trunc_len) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
+					       &alloc_size, &xv->xr_list);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (alloc_size > trunc_len)
+			alloc_size = trunc_len;
+
+		ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+						 phys_cpos, alloc_size,
+						 &dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+		ocfs2_remove_xattr_clusters_from_cache(inode, block,
+						       alloc_size);
+		cpos += alloc_size;
+		trunc_len -= alloc_size;
+	}
+
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+static int ocfs2_xattr_value_truncate(struct inode *inode,
+				      struct buffer_head *root_bh,
+				      struct ocfs2_xattr_value_root *xv,
+				      int len)
+{
+	int ret;
+	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
+	u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+
+	if (new_clusters == old_clusters)
+		return 0;
+
+	if (new_clusters > old_clusters)
+		ret = ocfs2_xattr_extend_allocation(inode,
+						    new_clusters - old_clusters,
+						    root_bh, xv);
+	else
+		ret = ocfs2_xattr_shrink_size(inode,
+					      old_clusters, new_clusters,
+					      root_bh, xv);
+
+	return ret;
+}
+
+static int ocfs2_xattr_list_entry(char *buffer, size_t size,
+				  size_t *result, const char *prefix,
+				  const char *name, int name_len)
+{
+	char *p = buffer + *result;
+	int prefix_len = strlen(prefix);
+	int total_len = prefix_len + name_len + 1;
+
+	*result += total_len;
+
+	/* we are just looking for how big our buffer needs to be */
+	if (!size)
+		return 0;
+
+	if (*result > size)
+		return -ERANGE;
+
+	memcpy(p, prefix, prefix_len);
+	memcpy(p + prefix_len, name, name_len);
+	p[prefix_len + name_len] = '\0';
+
+	return 0;
+}
+
+static int ocfs2_xattr_list_entries(struct inode *inode,
+				    struct ocfs2_xattr_header *header,
+				    char *buffer, size_t buffer_size)
+{
+	size_t result = 0;
+	int i, type, ret;
+	const char *prefix, *name;
+
+	for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+		type = ocfs2_xattr_get_type(entry);
+		prefix = ocfs2_xattr_prefix(type);
+
+		if (prefix) {
+			name = (const char *)header +
+				le16_to_cpu(entry->xe_name_offset);
+
+			ret = ocfs2_xattr_list_entry(buffer, buffer_size,
+						     &result, prefix, name,
+						     entry->xe_name_len);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return result;
+}
+
+static int ocfs2_xattr_ibody_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct ocfs2_xattr_header *header = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	int ret = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return ret;
+
+	header = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+
+	return ret;
+}
+
+static int ocfs2_xattr_block_list(struct inode *inode,
+				  struct ocfs2_dinode *di,
+				  char *buffer,
+				  size_t buffer_size)
+{
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto cleanup;
+	}
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+		ret = ocfs2_xattr_list_entries(inode, header,
+					       buffer, buffer_size);
+	} else {
+		struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+		ret = ocfs2_xattr_tree_list_index_block(inode, xt,
+						   buffer, buffer_size);
+	}
+cleanup:
+	brelse(blk_bh);
+
+	return ret;
+}
+
+ssize_t ocfs2_listxattr(struct dentry *dentry,
+			char *buffer,
+			size_t size)
+{
+	int ret = 0, i_ret = 0, b_ret = 0;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
+		return -EOPNOTSUPP;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return ret;
+
+	ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_read(&oi->ip_xattr_sem);
+	i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
+	if (i_ret < 0)
+		b_ret = 0;
+	else {
+		if (buffer) {
+			buffer += i_ret;
+			size -= i_ret;
+		}
+		b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
+					       buffer, size);
+		if (b_ret < 0)
+			i_ret = 0;
+	}
+	up_read(&oi->ip_xattr_sem);
+	ocfs2_inode_unlock(dentry->d_inode, 0);
+
+	brelse(di_bh);
+
+	return i_ret + b_ret;
+}
+
+static int ocfs2_xattr_find_entry(int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_entry *entry;
+	size_t name_len;
+	int i, cmp = 1;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	name_len = strlen(name);
+	entry = xs->here;
+	for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+		cmp = name_index - ocfs2_xattr_get_type(entry);
+		if (!cmp)
+			cmp = name_len - entry->xe_name_len;
+		if (!cmp)
+			cmp = memcmp(name, (xs->base +
+				     le16_to_cpu(entry->xe_name_offset)),
+				     name_len);
+		if (cmp == 0)
+			break;
+		entry += 1;
+	}
+	xs->here = entry;
+
+	return cmp ? -ENODATA : 0;
+}
+
+static int ocfs2_xattr_get_value_outside(struct inode *inode,
+					 struct ocfs2_xattr_value_root *xv,
+					 void *buffer,
+					 size_t len)
+{
+	u32 cpos, p_cluster, num_clusters, bpc, clusters;
+	u64 blkno;
+	int i, ret = 0;
+	size_t cplen, blocksize;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_extent_list *el;
+
+	el = &xv->xr_list;
+	clusters = le32_to_cpu(xv->xr_clusters);
+	bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	blocksize = inode->i_sb->s_blocksize;
+
+	cpos = 0;
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		/* Copy ocfs2_xattr_value */
+		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+			ret = ocfs2_read_block(inode, blkno, &bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			cplen = len >= blocksize ? blocksize : len;
+			memcpy(buffer, bh->b_data, cplen);
+			len -= cplen;
+			buffer += cplen;
+
+			brelse(bh);
+			bh = NULL;
+			if (len == 0)
+				break;
+		}
+		cpos += num_clusters;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_ibody_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct ocfs2_xattr_value_root *xv;
+	size_t size;
+	int ret = 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+		return -ENODATA;
+
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - le16_to_cpu(di->i_xattr_inline_size));
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	if (ret)
+		return ret;
+	size = le64_to_cpu(xs->here->xe_value_size);
+	if (buffer) {
+		if (size > buffer_size)
+			return -ERANGE;
+		if (ocfs2_xattr_is_local(xs->here)) {
+			memcpy(buffer, (void *)xs->base +
+			       le16_to_cpu(xs->here->xe_name_offset) +
+			       OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+		} else {
+			xv = (struct ocfs2_xattr_value_root *)
+				(xs->base + le16_to_cpu(
+				 xs->here->xe_name_offset) +
+				OCFS2_XATTR_SIZE(xs->here->xe_name_len));
+			ret = ocfs2_xattr_get_value_outside(inode, xv,
+							    buffer, size);
+			if (ret < 0) {
+				mlog_errno(ret);
+				return ret;
+			}
+		}
+	}
+
+	return size;
+}
+
+static int ocfs2_xattr_block_get(struct inode *inode,
+				 int name_index,
+				 const char *name,
+				 void *buffer,
+				 size_t buffer_size,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_xattr_value_root *xv;
+	size_t size;
+	int ret = -ENODATA, name_offset, name_len, block_off, i;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto cleanup;
+	}
+
+	xs->xattr_bh = blk_bh;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		xs->header = &xb->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+		xs->here = xs->header->xh_entries;
+
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	} else
+		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+						   name_index,
+						   name, xs);
+
+	if (ret)
+		goto cleanup;
+	size = le64_to_cpu(xs->here->xe_value_size);
+	if (buffer) {
+		ret = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+
+		name_offset = le16_to_cpu(xs->here->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
+		i = xs->here - xs->header->xh_entries;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+								xs->bucket.xh,
+								i,
+								&block_off,
+								&name_offset);
+			xs->base = xs->bucket.bhs[block_off]->b_data;
+		}
+		if (ocfs2_xattr_is_local(xs->here)) {
+			memcpy(buffer, (void *)xs->base +
+			       name_offset + name_len, size);
+		} else {
+			xv = (struct ocfs2_xattr_value_root *)
+				(xs->base + name_offset + name_len);
+			ret = ocfs2_xattr_get_value_outside(inode, xv,
+							    buffer, size);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto cleanup;
+			}
+		}
+	}
+	ret = size;
+cleanup:
+	for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
+		brelse(xs->bucket.bhs[i]);
+	memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+	brelse(blk_bh);
+	return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+int ocfs2_xattr_get(struct inode *inode,
+		    int name_index,
+		    const char *name,
+		    void *buffer,
+		    size_t buffer_size)
+{
+	int ret;
+	struct ocfs2_dinode *di = NULL;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		ret = -ENODATA;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_read(&oi->ip_xattr_sem);
+	ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
+				    buffer_size, &xis);
+	if (ret == -ENODATA)
+		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
+					    buffer_size, &xbs);
+	up_read(&oi->ip_xattr_sem);
+	ocfs2_inode_unlock(inode, 0);
+
+	brelse(di_bh);
+
+	return ret;
+}
+
+static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+					   struct ocfs2_xattr_value_root *xv,
+					   const void *value,
+					   int value_len)
+{
+	int ret = 0, i, cp_len, credits;
+	u16 blocksize = inode->i_sb->s_blocksize;
+	u32 p_cluster, num_clusters;
+	u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
+	u64 blkno;
+	struct buffer_head *bh = NULL;
+	handle_t *handle;
+
+	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
+
+	credits = clusters * bpc;
+	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	while (cpos < clusters) {
+		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+					       &num_clusters, &xv->xr_list);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+
+		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+			ret = ocfs2_read_block(inode, blkno, &bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			ret = ocfs2_journal_access(handle,
+						   inode,
+						   bh,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+
+			cp_len = value_len > blocksize ? blocksize : value_len;
+			memcpy(bh->b_data, value, cp_len);
+			value_len -= cp_len;
+			value += cp_len;
+			if (cp_len < blocksize)
+				memset(bh->b_data + cp_len, 0,
+				       blocksize - cp_len);
+
+			ret = ocfs2_journal_dirty(handle, bh);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto out_commit;
+			}
+			brelse(bh);
+			bh = NULL;
+
+			/*
+			 * XXX: do we need to empty all the following
+			 * blocks in this cluster?
+			 */
+			if (!value_len)
+				break;
+		}
+		cpos += num_clusters;
+	}
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	brelse(bh);
+
+	return ret;
+}
+
+static int ocfs2_xattr_cleanup(struct inode *inode,
+			       struct ocfs2_xattr_info *xi,
+			       struct ocfs2_xattr_search *xs,
+			       size_t offs)
+{
+	handle_t *handle = NULL;
+	int ret = 0;
+	size_t name_len = strlen(xi->name);
+	void *val = xs->base + offs;
+	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+	/* Decrease xattr count */
+	le16_add_cpu(&xs->header->xh_count, -1);
+	/* Remove the xattr entry and tree root which has already be set*/
+	memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
+	memset(val, 0, size);
+
+	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_update_entry(struct inode *inode,
+				    struct ocfs2_xattr_info *xi,
+				    struct ocfs2_xattr_search *xs,
+				    size_t offs)
+{
+	handle_t *handle = NULL;
+	int ret = 0;
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	xs->here->xe_name_offset = cpu_to_le16(offs);
+	xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+	if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
+		ocfs2_xattr_set_local(xs->here, 1);
+	else
+		ocfs2_xattr_set_local(xs->here, 0);
+	ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+
+	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set_value_outside()
+ *
+ * Set large size value in B tree.
+ */
+static int ocfs2_xattr_set_value_outside(struct inode *inode,
+					 struct ocfs2_xattr_info *xi,
+					 struct ocfs2_xattr_search *xs,
+					 size_t offs)
+{
+	size_t name_len = strlen(xi->name);
+	void *val = xs->base + offs;
+	struct ocfs2_xattr_value_root *xv = NULL;
+	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	int ret = 0;
+
+	memset(val, 0, size);
+	memcpy(val, xi->name, name_len);
+	xv = (struct ocfs2_xattr_value_root *)
+		(val + OCFS2_XATTR_SIZE(name_len));
+	xv->xr_clusters = 0;
+	xv->xr_last_eb_blk = 0;
+	xv->xr_list.l_tree_depth = 0;
+	xv->xr_list.l_count = cpu_to_le16(1);
+	xv->xr_list.l_next_free_rec = 0;
+
+	ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
+					 xi->value_len);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
+					      xi->value_len);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set_entry_local()
+ *
+ * Set, replace or remove extended attribute in local.
+ */
+static void ocfs2_xattr_set_entry_local(struct inode *inode,
+					struct ocfs2_xattr_info *xi,
+					struct ocfs2_xattr_search *xs,
+					struct ocfs2_xattr_entry *last,
+					size_t min_offs)
+{
+	size_t name_len = strlen(xi->name);
+	int i;
+
+	if (xi->value && xs->not_found) {
+		/* Insert the new xattr entry. */
+		le16_add_cpu(&xs->header->xh_count, 1);
+		ocfs2_xattr_set_type(last, xi->name_index);
+		ocfs2_xattr_set_local(last, 1);
+		last->xe_name_len = name_len;
+	} else {
+		void *first_val;
+		void *val;
+		size_t offs, size;
+
+		first_val = xs->base + min_offs;
+		offs = le16_to_cpu(xs->here->xe_name_offset);
+		val = xs->base + offs;
+
+		if (le64_to_cpu(xs->here->xe_value_size) >
+		    OCFS2_XATTR_INLINE_SIZE)
+			size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_ROOT_SIZE;
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+
+		if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_SIZE(xi->value_len)) {
+			/* The old and the new value have the
+			   same size. Just replace the value. */
+			ocfs2_xattr_set_local(xs->here, 1);
+			xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+			/* Clear value bytes. */
+			memset(val + OCFS2_XATTR_SIZE(name_len),
+			       0,
+			       OCFS2_XATTR_SIZE(xi->value_len));
+			memcpy(val + OCFS2_XATTR_SIZE(name_len),
+			       xi->value,
+			       xi->value_len);
+			return;
+		}
+		/* Remove the old name+value. */
+		memmove(first_val + size, first_val, val - first_val);
+		memset(first_val, 0, size);
+		xs->here->xe_name_hash = 0;
+		xs->here->xe_name_offset = 0;
+		ocfs2_xattr_set_local(xs->here, 1);
+		xs->here->xe_value_size = 0;
+
+		min_offs += size;
+
+		/* Adjust all value offsets. */
+		last = xs->header->xh_entries;
+		for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+			size_t o = le16_to_cpu(last->xe_name_offset);
+
+			if (o < offs)
+				last->xe_name_offset = cpu_to_le16(o + size);
+			last += 1;
+		}
+
+		if (!xi->value) {
+			/* Remove the old entry. */
+			last -= 1;
+			memmove(xs->here, xs->here + 1,
+				(void *)last - (void *)xs->here);
+			memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+			le16_add_cpu(&xs->header->xh_count, -1);
+		}
+	}
+	if (xi->value) {
+		/* Insert the new name+value. */
+		size_t size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_SIZE(xi->value_len);
+		void *val = xs->base + min_offs - size;
+
+		xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
+		memset(val, 0, size);
+		memcpy(val, xi->name, name_len);
+		memcpy(val + OCFS2_XATTR_SIZE(name_len),
+		       xi->value,
+		       xi->value_len);
+		xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+		ocfs2_xattr_set_local(xs->here, 1);
+		ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+	}
+
+	return;
+}
+
+/*
+ * ocfs2_xattr_set_entry()
+ *
+ * Set extended attribute entry into inode or block.
+ *
+ * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
+ * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(),
+ * then set value in B tree with set_value_outside().
+ */
+static int ocfs2_xattr_set_entry(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs,
+				 int flag)
+{
+	struct ocfs2_xattr_entry *last;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
+	size_t size_l = 0;
+	handle_t *handle = NULL;
+	int free, i, ret;
+	struct ocfs2_xattr_info xi_l = {
+		.name_index = xi->name_index,
+		.name = xi->name,
+		.value = xi->value,
+		.value_len = xi->value_len,
+	};
+
+	/* Compute min_offs, last and free space. */
+	last = xs->header->xh_entries;
+
+	for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+		size_t offs = le16_to_cpu(last->xe_name_offset);
+		if (offs < min_offs)
+			min_offs = offs;
+		last += 1;
+	}
+
+	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	if (free < 0)
+		return -EFAULT;
+
+	if (!xs->not_found) {
+		size_t size = 0;
+		if (ocfs2_xattr_is_local(xs->here))
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+				OCFS2_XATTR_ROOT_SIZE;
+		free += (size + sizeof(struct ocfs2_xattr_entry));
+	}
+	/* Check free space in inode or block */
+	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		if (free < sizeof(struct ocfs2_xattr_entry) +
+			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_ROOT_SIZE) {
+			ret = -ENOSPC;
+			goto out;
+		}
+		size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+		xi_l.value = (void *)&def_xv;
+		xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
+	} else if (xi->value) {
+		if (free < sizeof(struct ocfs2_xattr_entry) +
+			   OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->value_len)) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	}
+
+	if (!xs->not_found) {
+		/* For existing extended attribute */
+		size_t size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+		void *val = xs->base + offs;
+
+		if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
+			/* Replace existing local xattr with tree root */
+			ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
+							    offs);
+			if (ret < 0)
+				mlog_errno(ret);
+			goto out;
+		} else if (!ocfs2_xattr_is_local(xs->here)) {
+			/* For existing xattr which has value outside */
+			struct ocfs2_xattr_value_root *xv = NULL;
+			xv = (struct ocfs2_xattr_value_root *)(val +
+				OCFS2_XATTR_SIZE(name_len));
+
+			if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+				/*
+				 * If new value need set outside also,
+				 * first truncate old value to new value,
+				 * then set new value with set_value_outside().
+				 */
+				ret = ocfs2_xattr_value_truncate(inode,
+								 xs->xattr_bh,
+								 xv,
+								 xi->value_len);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = __ocfs2_xattr_set_value_outside(inode,
+								xv,
+								xi->value,
+								xi->value_len);
+				if (ret < 0) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = ocfs2_xattr_update_entry(inode,
+							       xi,
+							       xs,
+							       offs);
+				if (ret < 0)
+					mlog_errno(ret);
+				goto out;
+			} else {
+				/*
+				 * If new value need set in local,
+				 * just trucate old value to zero.
+				 */
+				 ret = ocfs2_xattr_value_truncate(inode,
+								 xs->xattr_bh,
+								 xv,
+								 0);
+				if (ret < 0)
+					mlog_errno(ret);
+			}
+		}
+	}
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+		/* set extended attribute in external block. */
+		ret = ocfs2_extend_trans(handle,
+					 OCFS2_INODE_UPDATE_CREDITS +
+					 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	/*
+	 * Set value in local, include set tree root in local.
+	 * This is the first step for value size >INLINE_SIZE.
+	 */
+	ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
+
+	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+		ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
+	    (flag & OCFS2_INLINE_XATTR_FL)) {
+		struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+		unsigned int xattrsize = osb->s_xattr_inline_size;
+
+		/*
+		 * Adjust extent record count or inline data size
+		 * to reserve space for extended attribute.
+		 */
+		if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+			struct ocfs2_inline_data *idata = &di->id2.i_data;
+			le16_add_cpu(&idata->id_count, -xattrsize);
+		} else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+			struct ocfs2_extent_list *el = &di->id2.i_list;
+			le16_add_cpu(&el->l_count, -(xattrsize /
+					sizeof(struct ocfs2_extent_rec)));
+		}
+		di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+	}
+	/* Update xattr flag */
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= flag;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+	/* Update inode ctime */
+	inode->i_ctime = CURRENT_TIME;
+	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
+	if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/*
+		 * Set value outside in B tree.
+		 * This is the second step for value size > INLINE_SIZE.
+		 */
+		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+		if (ret < 0) {
+			int ret2;
+
+			mlog_errno(ret);
+			/*
+			 * If set value outside failed, we have to clean
+			 * the junk tree root we have already set in local.
+			 */
+			ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+			if (ret2 < 0)
+				mlog_errno(ret2);
+		}
+	}
+out:
+	return ret;
+
+}
+
+static int ocfs2_remove_value_outside(struct inode*inode,
+				      struct buffer_head *bh,
+				      struct ocfs2_xattr_header *header)
+{
+	int ret = 0, i;
+
+	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+
+		if (!ocfs2_xattr_is_local(entry)) {
+			struct ocfs2_xattr_value_root *xv;
+			void *val;
+
+			val = (void *)header +
+				le16_to_cpu(entry->xe_name_offset);
+			xv = (struct ocfs2_xattr_value_root *)
+				(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+			ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+			if (ret < 0) {
+				mlog_errno(ret);
+				return ret;
+			}
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_ibody_remove(struct inode *inode,
+				    struct buffer_head *di_bh)
+{
+
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_xattr_header *header;
+	int ret;
+
+	header = (struct ocfs2_xattr_header *)
+		 ((void *)di + inode->i_sb->s_blocksize -
+		 le16_to_cpu(di->i_xattr_inline_size));
+
+	ret = ocfs2_remove_value_outside(inode, di_bh, header);
+
+	return ret;
+}
+
+static int ocfs2_xattr_block_remove(struct inode *inode,
+				    struct buffer_head *blk_bh)
+{
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
+		ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+	} else
+		ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
+
+	return ret;
+}
+
+static int ocfs2_xattr_free_block(struct inode *inode,
+				  u64 block)
+{
+	struct inode *xb_alloc_inode;
+	struct buffer_head *xb_alloc_bh = NULL;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle;
+	int ret = 0;
+	u64 blk, bg_blkno;
+	u16 bit;
+
+	ret = ocfs2_read_block(inode, block, &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = ocfs2_xattr_block_remove(inode, blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+	blk = le64_to_cpu(xb->xb_blkno);
+	bit = le16_to_cpu(xb->xb_suballoc_bit);
+	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+	xb_alloc_inode = ocfs2_get_system_file_inode(osb,
+				EXTENT_ALLOC_SYSTEM_INODE,
+				le16_to_cpu(xb->xb_suballoc_slot));
+	if (!xb_alloc_inode) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	mutex_lock(&xb_alloc_inode->i_mutex);
+
+	ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_mutex;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
+				       bit, bg_blkno, 1);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, handle);
+out_unlock:
+	ocfs2_inode_unlock(xb_alloc_inode, 1);
+	brelse(xb_alloc_bh);
+out_mutex:
+	mutex_unlock(&xb_alloc_inode->i_mutex);
+	iput(xb_alloc_inode);
+out:
+	brelse(blk_bh);
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_remove()
+ *
+ * Free extended attribute resources associated with this inode.
+ */
+int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	handle_t *handle;
+	int ret;
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+		return 0;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (di->i_xattr_loc) {
+		ret = ocfs2_xattr_free_block(inode,
+					     le64_to_cpu(di->i_xattr_loc));
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	di->i_xattr_loc = 0;
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+	return ret;
+}
+
+static int ocfs2_xattr_has_space_inline(struct inode *inode,
+					struct ocfs2_dinode *di)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+	int free;
+
+	if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
+		return 0;
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		struct ocfs2_inline_data *idata = &di->id2.i_data;
+		free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
+	} else if (ocfs2_inode_is_fast_symlink(inode)) {
+		free = ocfs2_fast_symlink_chars(inode->i_sb) -
+			le64_to_cpu(di->i_size);
+	} else {
+		struct ocfs2_extent_list *el = &di->id2.i_list;
+		free = (le16_to_cpu(el->l_count) -
+			le16_to_cpu(el->l_next_free_rec)) *
+			sizeof(struct ocfs2_extent_rec);
+	}
+	if (free >= xattrsize)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_find()
+ *
+ * Find extended attribute in inode block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_ibody_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	int ret;
+	int has_space = 0;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return 0;
+
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+		down_read(&oi->ip_alloc_sem);
+		has_space = ocfs2_xattr_has_space_inline(inode, di);
+		up_read(&oi->ip_alloc_sem);
+		if (!has_space)
+			return 0;
+	}
+
+	xs->xattr_bh = xs->inode_bh;
+	xs->end = (void *)di + inode->i_sb->s_blocksize;
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
+		xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - le16_to_cpu(di->i_xattr_inline_size));
+	else
+		xs->header = (struct ocfs2_xattr_header *)
+			(xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size);
+	xs->base = (void *)xs->header;
+	xs->here = xs->header->xh_entries;
+
+	/* Find the named attribute. */
+	if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+		if (ret && ret != -ENODATA)
+			return ret;
+		xs->not_found = ret;
+	}
+
+	return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_set()
+ *
+ * Set, replace or remove an extended attribute into inode block.
+ *
+ */
+static int ocfs2_xattr_ibody_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	int ret;
+
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+		return -ENOSPC;
+
+	down_write(&oi->ip_alloc_sem);
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+		if (!ocfs2_xattr_has_space_inline(inode, di)) {
+			ret = -ENOSPC;
+			goto out;
+		}
+	}
+
+	ret = ocfs2_xattr_set_entry(inode, xi, xs,
+				(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
+out:
+	up_write(&oi->ip_alloc_sem);
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_block_find()
+ *
+ * Find extended attribute in external block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_block_find(struct inode *inode,
+				  int name_index,
+				  const char *name,
+				  struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct buffer_head *blk_bh = NULL;
+	struct ocfs2_xattr_block *xb;
+	int ret = 0;
+
+	if (!di->i_xattr_loc)
+		return ret;
+
+	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	/*Verify the signature of xattr block*/
+	if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+		   strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+			ret = -EFAULT;
+			goto cleanup;
+	}
+
+	xs->xattr_bh = blk_bh;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		xs->header = &xb->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+		xs->here = xs->header->xh_entries;
+
+		ret = ocfs2_xattr_find_entry(name_index, name, xs);
+	} else
+		ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+						   name_index,
+						   name, xs);
+
+	if (ret && ret != -ENODATA) {
+		xs->xattr_bh = NULL;
+		goto cleanup;
+	}
+	xs->not_found = ret;
+	return 0;
+cleanup:
+	brelse(blk_bh);
+
+	return ret;
+}
+
+/*
+ * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
+ * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
+ * re-initialized.
+ */
+static int ocfs2_restore_xattr_block(struct inode *inode,
+				     struct ocfs2_xattr_search *xs)
+{
+	int ret;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+	u16 xb_flags = le16_to_cpu(xb->xb_flags);
+
+	BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
+		le16_to_cpu(el->l_next_free_rec) != 0);
+
+	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+	       offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+	xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
+
+	ocfs2_journal_dirty(handle, xs->xattr_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_block_set()
+ *
+ * Set, replace or remove an extended attribute into external block.
+ *
+ */
+static int ocfs2_xattr_block_set(struct inode *inode,
+				 struct ocfs2_xattr_info *xi,
+				 struct ocfs2_xattr_search *xs)
+{
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle = NULL;
+	struct ocfs2_xattr_block *xblk = NULL;
+	u16 suballoc_bit_start;
+	u32 num_got;
+	u64 first_blkno;
+	int ret;
+
+	if (!xs->xattr_bh) {
+		/*
+		 * Alloc one external block for extended attribute
+		 * outside of inode.
+		 */
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		handle = ocfs2_start_trans(osb,
+					   OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			goto out;
+		}
+		ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+					   &suballoc_bit_start, &num_got,
+					   &first_blkno);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		new_bh = sb_getblk(inode->i_sb, first_blkno);
+		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+		ret = ocfs2_journal_access(handle, inode, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		/* Initialize ocfs2_xattr_block */
+		xs->xattr_bh = new_bh;
+		xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+		memset(xblk, 0, inode->i_sb->s_blocksize);
+		strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+		xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+		xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+		xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+		xblk->xb_blkno = cpu_to_le64(first_blkno);
+
+		xs->header = &xblk->xb_attrs.xb_header;
+		xs->base = (void *)xs->header;
+		xs->end = (void *)xblk + inode->i_sb->s_blocksize;
+		xs->here = xs->header->xh_entries;
+
+
+		ret = ocfs2_journal_dirty(handle, new_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		di->i_xattr_loc = cpu_to_le64(first_blkno);
+		ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+		if (ret < 0)
+			mlog_errno(ret);
+out_commit:
+		ocfs2_commit_trans(osb, handle);
+out:
+		if (meta_ac)
+			ocfs2_free_alloc_context(meta_ac);
+		if (ret < 0)
+			return ret;
+	} else
+		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+	if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
+		/* Set extended attribute into external block */
+		ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+		if (!ret || ret != -ENOSPC)
+			goto end;
+
+		ret = ocfs2_xattr_create_index_block(inode, xs);
+		if (ret)
+			goto end;
+	}
+
+	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+	if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
+		ret = ocfs2_restore_xattr_block(inode, xs);
+
+end:
+
+	return ret;
+}
+
+/*
+ * ocfs2_xattr_set()
+ *
+ * Set, replace or remove an extended attribute for this inode.
+ * value is NULL to remove an existing extended attribute, else either
+ * create or replace an extended attribute.
+ */
+int ocfs2_xattr_set(struct inode *inode,
+		    int name_index,
+		    const char *name,
+		    const void *value,
+		    size_t value_len,
+		    int flags)
+{
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	int ret;
+	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	struct ocfs2_xattr_info xi = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+	};
+
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+	/*
+	 * Scan inode and external block to find the same name
+	 * extended attribute and collect search infomation.
+	 */
+	ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+	if (ret)
+		goto cleanup;
+	if (xis.not_found) {
+		ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+		if (ret)
+			goto cleanup;
+	}
+
+	if (xis.not_found && xbs.not_found) {
+		ret = -ENODATA;
+		if (flags & XATTR_REPLACE)
+			goto cleanup;
+		ret = 0;
+		if (!value)
+			goto cleanup;
+	} else {
+		ret = -EEXIST;
+		if (flags & XATTR_CREATE)
+			goto cleanup;
+	}
+
+	if (!value) {
+		/* Remove existing extended attribute */
+		if (!xis.not_found)
+			ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+		else if (!xbs.not_found)
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+	} else {
+		/* We always try to set extended attribute into inode first*/
+		ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+		if (!ret && !xbs.not_found) {
+			/*
+			 * If succeed and that extended attribute existing in
+			 * external block, then we will remove it.
+			 */
+			xi.value = NULL;
+			xi.value_len = 0;
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+		} else if (ret == -ENOSPC) {
+			if (di->i_xattr_loc && !xbs.xattr_bh) {
+				ret = ocfs2_xattr_block_find(inode, name_index,
+							     name, &xbs);
+				if (ret)
+					goto cleanup;
+			}
+			/*
+			 * If no space in inode, we will set extended attribute
+			 * into external block.
+			 */
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+			if (ret)
+				goto cleanup;
+			if (!xis.not_found) {
+				/*
+				 * If succeed and that extended attribute
+				 * existing in inode, we will remove it.
+				 */
+				xi.value = NULL;
+				xi.value_len = 0;
+				ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+			}
+		}
+	}
+cleanup:
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	ocfs2_inode_unlock(inode, 1);
+	brelse(di_bh);
+	brelse(xbs.xattr_bh);
+	for (i = 0; i < blk_per_bucket; i++)
+		brelse(xbs.bucket.bhs[i]);
+
+	return ret;
+}
+
+/*
+ * Find the xattr extent rec which may contains name_hash.
+ * e_cpos will be the first name hash of the xattr rec.
+ * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list.
+ */
+static int ocfs2_xattr_get_rec(struct inode *inode,
+			       u32 name_hash,
+			       u64 *p_blkno,
+			       u32 *e_cpos,
+			       u32 *num_clusters,
+			       struct ocfs2_extent_list *el)
+{
+	int ret = 0, i;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_rec *rec = NULL;
+	u64 e_blkno = 0;
+
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
+
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "xattr tree block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+		rec = &el->l_recs[i];
+
+		if (le32_to_cpu(rec->e_cpos) <= name_hash) {
+			e_blkno = le64_to_cpu(rec->e_blkno);
+			break;
+		}
+	}
+
+	if (!e_blkno) {
+		ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+			    "record (%u, %u, 0) in xattr", inode->i_ino,
+			    le32_to_cpu(rec->e_cpos),
+			    ocfs2_rec_clusters(el, rec));
+		ret = -EROFS;
+		goto out;
+	}
+
+	*p_blkno = le64_to_cpu(rec->e_blkno);
+	*num_clusters = le16_to_cpu(rec->e_leaf_clusters);
+	if (e_cpos)
+		*e_cpos = le32_to_cpu(rec->e_cpos);
+out:
+	brelse(eb_bh);
+	return ret;
+}
+
+typedef int (xattr_bucket_func)(struct inode *inode,
+				struct ocfs2_xattr_bucket *bucket,
+				void *para);
+
+static int ocfs2_find_xe_in_bucket(struct inode *inode,
+				   struct buffer_head *header_bh,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u16 *xe_index,
+				   int *found)
+{
+	int i, ret = 0, cmp = 1, block_off, new_offset;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	size_t name_len = strlen(name);
+	struct ocfs2_xattr_entry *xe = NULL;
+	struct buffer_head *name_bh = NULL;
+	char *xe_name;
+
+	/*
+	 * We don't use binary search in the bucket because there
+	 * may be multiple entries with the same name hash.
+	 */
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+
+		if (name_hash > le32_to_cpu(xe->xe_name_hash))
+			continue;
+		else if (name_hash < le32_to_cpu(xe->xe_name_hash))
+			break;
+
+		cmp = name_index - ocfs2_xattr_get_type(xe);
+		if (!cmp)
+			cmp = name_len - xe->xe_name_len;
+		if (cmp)
+			continue;
+
+		ret = ocfs2_xattr_bucket_get_name_value(inode,
+							xh,
+							i,
+							&block_off,
+							&new_offset);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
+				       &name_bh);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+		xe_name = name_bh->b_data + new_offset;
+
+		cmp = memcmp(name, xe_name, name_len);
+		brelse(name_bh);
+		name_bh = NULL;
+
+		if (cmp == 0) {
+			*xe_index = i;
+			*found = 1;
+			ret = 0;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Find the specified xattr entry in a series of buckets.
+ * This series start from p_blkno and last for num_clusters.
+ * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
+ * the num of the valid buckets.
+ *
+ * Return the buffer_head this xattr should reside in. And if the xattr's
+ * hash is in the gap of 2 buckets, return the lower bucket.
+ */
+static int ocfs2_xattr_bucket_find(struct inode *inode,
+				   int name_index,
+				   const char *name,
+				   u32 name_hash,
+				   u64 p_blkno,
+				   u32 first_hash,
+				   u32 num_clusters,
+				   struct ocfs2_xattr_search *xs)
+{
+	int ret, found = 0;
+	struct buffer_head *bh = NULL;
+	struct buffer_head *lower_bh = NULL;
+	struct ocfs2_xattr_header *xh = NULL;
+	struct ocfs2_xattr_entry *xe = NULL;
+	u16 index = 0;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int low_bucket = 0, bucket, high_bucket;
+	u32 last_hash;
+	u64 blkno;
+
+	ret = ocfs2_read_block(inode, p_blkno, &bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
+
+	while (low_bucket <= high_bucket) {
+		brelse(bh);
+		bh = NULL;
+		bucket = (low_bucket + high_bucket) / 2;
+
+		blkno = p_blkno + bucket * blk_per_bucket;
+
+		ret = ocfs2_read_block(inode, blkno, &bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		xh = (struct ocfs2_xattr_header *)bh->b_data;
+		xe = &xh->xh_entries[0];
+		if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
+			high_bucket = bucket - 1;
+			continue;
+		}
+
+		/*
+		 * Check whether the hash of the last entry in our
+		 * bucket is larger than the search one. for an empty
+		 * bucket, the last one is also the first one.
+		 */
+		if (xh->xh_count)
+			xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
+
+		last_hash = le32_to_cpu(xe->xe_name_hash);
+
+		/* record lower_bh which may be the insert place. */
+		brelse(lower_bh);
+		lower_bh = bh;
+		bh = NULL;
+
+		if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
+			low_bucket = bucket + 1;
+			continue;
+		}
+
+		/* the searched xattr should reside in this bucket if exists. */
+		ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+					      name_index, name, name_hash,
+					      &index, &found);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		break;
+	}
+
+	/*
+	 * Record the bucket we have found.
+	 * When the xattr's hash value is in the gap of 2 buckets, we will
+	 * always set it to the previous bucket.
+	 */
+	if (!lower_bh) {
+		/*
+		 * We can't find any bucket whose first name_hash is less
+		 * than the find name_hash.
+		 */
+		BUG_ON(bh->b_blocknr != p_blkno);
+		lower_bh = bh;
+		bh = NULL;
+	}
+	xs->bucket.bhs[0] = lower_bh;
+	xs->bucket.xh = (struct ocfs2_xattr_header *)
+					xs->bucket.bhs[0]->b_data;
+	lower_bh = NULL;
+
+	xs->header = xs->bucket.xh;
+	xs->base = xs->bucket.bhs[0]->b_data;
+	xs->end = xs->base + inode->i_sb->s_blocksize;
+
+	if (found) {
+		/*
+		 * If we have found the xattr enty, read all the blocks in
+		 * this bucket.
+		 */
+		ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					OCFS2_BH_CACHED);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		xs->here = &xs->header->xh_entries[index];
+		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
+		     (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+	} else
+		ret = -ENODATA;
+
+out:
+	brelse(bh);
+	brelse(lower_bh);
+	return ret;
+}
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+					struct buffer_head *root_bh,
+					int name_index,
+					const char *name,
+					struct ocfs2_xattr_search *xs)
+{
+	int ret;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	u64 p_blkno = 0;
+	u32 first_hash, num_clusters = 0;
+	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return -ENODATA;
+
+	mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
+	     name, name_hash, name_index);
+
+	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
+				  &num_clusters, el);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
+
+	mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
+	     "in the rec is %u\n", num_clusters, p_blkno, first_hash);
+
+	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
+				      p_blkno, first_hash, num_clusters, xs);
+
+out:
+	return ret;
+}
+
+static int ocfs2_iterate_xattr_buckets(struct inode *inode,
+				       u64 blkno,
+				       u32 clusters,
+				       xattr_bucket_func *func,
+				       void *para)
+{
+	int i, j, ret = 0;
+	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+	u32 num_buckets = clusters * bpc;
+	struct ocfs2_xattr_bucket bucket;
+
+	memset(&bucket, 0, sizeof(bucket));
+
+	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
+	     clusters, blkno);
+
+	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
+		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
+					bucket.bhs, OCFS2_BH_CACHED);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
+		/*
+		 * The real bucket num in this series of blocks is stored
+		 * in the 1st bucket.
+		 */
+		if (i == 0)
+			num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+
+		mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
+		     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+		if (func) {
+			ret = func(inode, &bucket, para);
+			if (ret) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		for (j = 0; j < blk_per_bucket; j++)
+			brelse(bucket.bhs[j]);
+		memset(&bucket, 0, sizeof(bucket));
+	}
+
+out:
+	for (j = 0; j < blk_per_bucket; j++)
+		brelse(bucket.bhs[j]);
+
+	return ret;
+}
+
+struct ocfs2_xattr_tree_list {
+	char *buffer;
+	size_t buffer_size;
+	size_t result;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+					     struct ocfs2_xattr_header *xh,
+					     int index,
+					     int *block_off,
+					     int *new_offset)
+{
+	u16 name_offset;
+
+	if (index < 0 || index >= le16_to_cpu(xh->xh_count))
+		return -EINVAL;
+
+	name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
+
+	*block_off = name_offset >> inode->i_sb->s_blocksize_bits;
+	*new_offset = name_offset % inode->i_sb->s_blocksize;
+
+	return 0;
+}
+
+static int ocfs2_list_xattr_bucket(struct inode *inode,
+				   struct ocfs2_xattr_bucket *bucket,
+				   void *para)
+{
+	int ret = 0, type;
+	struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
+	int i, block_off, new_offset;
+	const char *prefix, *name;
+
+	for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+		type = ocfs2_xattr_get_type(entry);
+		prefix = ocfs2_xattr_prefix(type);
+
+		if (prefix) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+								bucket->xh,
+								i,
+								&block_off,
+								&new_offset);
+			if (ret)
+				break;
+
+			name = (const char *)bucket->bhs[block_off]->b_data +
+				new_offset;
+			ret = ocfs2_xattr_list_entry(xl->buffer,
+						     xl->buffer_size,
+						     &xl->result,
+						     prefix, name,
+						     entry->xe_name_len);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+					     struct ocfs2_xattr_tree_root *xt,
+					     char *buffer,
+					     size_t buffer_size)
+{
+	struct ocfs2_extent_list *el = &xt->xt_list;
+	int ret = 0;
+	u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
+	u64 p_blkno = 0;
+	struct ocfs2_xattr_tree_list xl = {
+		.buffer = buffer,
+		.buffer_size = buffer_size,
+		.result = 0,
+	};
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return 0;
+
+	while (name_hash > 0) {
+		ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+					  &e_cpos, &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+						  ocfs2_list_xattr_bucket,
+						  &xl);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (e_cpos == 0)
+			break;
+
+		name_hash = e_cpos - 1;
+	}
+
+	ret = xl.result;
+out:
+	return ret;
+}
+
+static int cmp_xe(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *l = a, *r = b;
+	u32 l_hash = le32_to_cpu(l->xe_name_hash);
+	u32 r_hash = le32_to_cpu(r->xe_name_hash);
+
+	if (l_hash > r_hash)
+		return 1;
+	if (l_hash < r_hash)
+		return -1;
+	return 0;
+}
+
+static void swap_xe(void *a, void *b, int size)
+{
+	struct ocfs2_xattr_entry *l = a, *r = b, tmp;
+
+	tmp = *l;
+	memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
+	memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
+}
+
+/*
+ * When the ocfs2_xattr_block is filled up, new bucket will be created
+ * and all the xattr entries will be moved to the new bucket.
+ * Note: we need to sort the entries since they are not saved in order
+ * in the ocfs2_xattr_block.
+ */
+static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
+					   struct buffer_head *xb_bh,
+					   struct buffer_head *xh_bh,
+					   struct buffer_head *data_bh)
+{
+	int i, blocksize = inode->i_sb->s_blocksize;
+	u16 offset, size, off_change;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_block *xb =
+				(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
+	struct ocfs2_xattr_header *xh =
+				(struct ocfs2_xattr_header *)xh_bh->b_data;
+	u16 count = le16_to_cpu(xb_xh->xh_count);
+	char *target = xh_bh->b_data, *src = xb_bh->b_data;
+
+	mlog(0, "cp xattr from block %llu to bucket %llu\n",
+	     (unsigned long long)xb_bh->b_blocknr,
+	     (unsigned long long)xh_bh->b_blocknr);
+
+	memset(xh_bh->b_data, 0, blocksize);
+	if (data_bh)
+		memset(data_bh->b_data, 0, blocksize);
+	/*
+	 * Since the xe_name_offset is based on ocfs2_xattr_header,
+	 * there is a offset change corresponding to the change of
+	 * ocfs2_xattr_header's position.
+	 */
+	off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	xe = &xb_xh->xh_entries[count - 1];
+	offset = le16_to_cpu(xe->xe_name_offset) + off_change;
+	size = blocksize - offset;
+
+	/* copy all the names and values. */
+	if (data_bh)
+		target = data_bh->b_data;
+	memcpy(target + offset, src + offset, size);
+
+	/* Init new header now. */
+	xh->xh_count = xb_xh->xh_count;
+	xh->xh_num_buckets = cpu_to_le16(1);
+	xh->xh_name_value_len = cpu_to_le16(size);
+	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
+
+	/* copy all the entries. */
+	target = xh_bh->b_data;
+	offset = offsetof(struct ocfs2_xattr_header, xh_entries);
+	size = count * sizeof(struct ocfs2_xattr_entry);
+	memcpy(target + offset, (char *)xb_xh + offset, size);
+
+	/* Change the xe offset for all the xe because of the move. */
+	off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
+		 offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+	for (i = 0; i < count; i++)
+		le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
+
+	mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
+	     offset, size, off_change);
+
+	sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+}
+
+/*
+ * After we move xattr from block to index btree, we have to
+ * update ocfs2_xattr_search to the new xe and base.
+ *
+ * When the entry is in xattr block, xattr_bh indicates the storage place.
+ * While if the entry is in index b-tree, "bucket" indicates the
+ * real place of the xattr.
+ */
+static int ocfs2_xattr_update_xattr_search(struct inode *inode,
+					   struct ocfs2_xattr_search *xs,
+					   struct buffer_head *old_bh,
+					   struct buffer_head *new_bh)
+{
+	int ret = 0;
+	char *buf = old_bh->b_data;
+	struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
+	struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
+	int i, blocksize = inode->i_sb->s_blocksize;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	xs->bucket.bhs[0] = new_bh;
+	get_bh(new_bh);
+	xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
+	xs->header = xs->bucket.xh;
+
+	xs->base = new_bh->b_data;
+	xs->end = xs->base + inode->i_sb->s_blocksize;
+
+	if (!xs->not_found) {
+		if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
+			ret = ocfs2_read_blocks(inode,
+					xs->bucket.bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					OCFS2_BH_CACHED);
+			if (ret) {
+				mlog_errno(ret);
+				return ret;
+			}
+
+			i = xs->here - old_xh->xh_entries;
+			xs->here = &xs->header->xh_entries[i];
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+					  struct ocfs2_xattr_search *xs)
+{
+	int ret, credits = OCFS2_SUBALLOC_ALLOC;
+	u32 bit_off, len;
+	u64 blkno;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_alloc_context *data_ac;
+	struct buffer_head *xh_bh = NULL, *data_bh = NULL;
+	struct buffer_head *xb_bh = xs->xattr_bh;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xr;
+	u16 xb_flags = le16_to_cpu(xb->xb_flags);
+	u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	mlog(0, "create xattr index block for %llu\n",
+	     (unsigned long long)xb_bh->b_blocknr);
+
+	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+
+	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * XXX:
+	 * We can use this lock for now, and maybe move to a dedicated mutex
+	 * if performance becomes a problem later.
+	 */
+	down_write(&oi->ip_alloc_sem);
+
+	/*
+	 * 3 more credits, one for xattr block update, one for the 1st block
+	 * of the new xattr bucket and one for the value/data.
+	 */
+	credits += 3;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_sem;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xb_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * The bucket may spread in many blocks, and
+	 * we will only touch the 1st block and the last block
+	 * in the whole bucket(one for entry and one for data).
+	 */
+	blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
+
+	mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
+
+	xh_bh = sb_getblk(inode->i_sb, blkno);
+	if (!xh_bh) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, xh_bh);
+
+	ret = ocfs2_journal_access(handle, inode, xh_bh,
+				   OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (bpb > 1) {
+		data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
+		if (!data_bh) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		ocfs2_set_new_buffer_uptodate(inode, data_bh);
+
+		ret = ocfs2_journal_access(handle, inode, data_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+	}
+
+	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+
+	ocfs2_journal_dirty(handle, xh_bh);
+	if (data_bh)
+		ocfs2_journal_dirty(handle, data_bh);
+
+	ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+
+	/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
+	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+	       offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+	xr = &xb->xb_attrs.xb_root;
+	xr->xt_clusters = cpu_to_le32(1);
+	xr->xt_last_eb_blk = 0;
+	xr->xt_list.l_tree_depth = 0;
+	xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
+	xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+
+	xr->xt_list.l_recs[0].e_cpos = 0;
+	xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+	xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+
+	xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
+
+	ret = ocfs2_journal_dirty(handle, xb_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_sem:
+	up_write(&oi->ip_alloc_sem);
+
+out:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	brelse(xh_bh);
+	brelse(data_bh);
+
+	return ret;
+}
+
+static int cmp_xe_offset(const void *a, const void *b)
+{
+	const struct ocfs2_xattr_entry *l = a, *r = b;
+	u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
+	u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
+
+	if (l_name_offset < r_name_offset)
+		return 1;
+	if (l_name_offset > r_name_offset)
+		return -1;
+	return 0;
+}
+
+/*
+ * defrag a xattr bucket if we find that the bucket has some
+ * holes beteen name/value pairs.
+ * We will move all the name/value pairs to the end of the bucket
+ * so that we can spare some space for insertion.
+ */
+static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+				     struct ocfs2_xattr_bucket *bucket)
+{
+	int ret, i;
+	size_t end, offset, len, value_len;
+	struct ocfs2_xattr_header *xh;
+	char *entries, *buf, *bucket_buf = NULL;
+	u64 blkno = bucket->bhs[0]->b_blocknr;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u16 xh_free_start;
+	size_t blocksize = inode->i_sb->s_blocksize;
+	handle_t *handle;
+	struct buffer_head **bhs;
+	struct ocfs2_xattr_entry *xe;
+
+	bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+			GFP_NOFS);
+	if (!bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs,
+				OCFS2_BH_CACHED);
+	if (ret)
+		goto out;
+
+	/*
+	 * In order to make the operation more efficient and generic,
+	 * we copy all the blocks into a contiguous memory and do the
+	 * defragment there, so if anything is error, we will not touch
+	 * the real block.
+	 */
+	bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+	if (!bucket_buf) {
+		ret = -EIO;
+		goto out;
+	}
+
+	buf = bucket_buf;
+	for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
+		memcpy(buf, bhs[i]->b_data, blocksize);
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto commit;
+		}
+	}
+
+	xh = (struct ocfs2_xattr_header *)bucket_buf;
+	entries = (char *)xh->xh_entries;
+	xh_free_start = le16_to_cpu(xh->xh_free_start);
+
+	mlog(0, "adjust xattr bucket in %llu, count = %u, "
+	     "xh_free_start = %u, xh_name_value_len = %u.\n",
+	     blkno, le16_to_cpu(xh->xh_count), xh_free_start,
+	     le16_to_cpu(xh->xh_name_value_len));
+
+	/*
+	 * sort all the entries by their offset.
+	 * the largest will be the first, so that we can
+	 * move them to the end one by one.
+	 */
+	sort(entries, le16_to_cpu(xh->xh_count),
+	     sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe_offset, swap_xe);
+
+	/* Move all name/values to the end of the bucket. */
+	xe = xh->xh_entries;
+	end = OCFS2_XATTR_BUCKET_SIZE;
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
+		offset = le16_to_cpu(xe->xe_name_offset);
+		if (ocfs2_xattr_is_local(xe))
+			value_len = OCFS2_XATTR_SIZE(
+					le64_to_cpu(xe->xe_value_size));
+		else
+			value_len = OCFS2_XATTR_ROOT_SIZE;
+		len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
+
+		/*
+		 * We must make sure that the name/value pair
+		 * exist in the same block. So adjust end to
+		 * the previous block end if needed.
+		 */
+		if (((end - len) / blocksize !=
+			(end - 1) / blocksize))
+			end = end - end % blocksize;
+
+		if (end > offset + len) {
+			memmove(bucket_buf + end - len,
+				bucket_buf + offset, len);
+			xe->xe_name_offset = cpu_to_le16(end - len);
+		}
+
+		mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
+				"bucket %llu\n", (unsigned long long)blkno);
+
+		end -= len;
+	}
+
+	mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
+			"bucket %llu\n", (unsigned long long)blkno);
+
+	if (xh_free_start == end)
+		goto commit;
+
+	memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
+	xh->xh_free_start = cpu_to_le16(end);
+
+	/* sort the entries by their name_hash. */
+	sort(entries, le16_to_cpu(xh->xh_count),
+	     sizeof(struct ocfs2_xattr_entry),
+	     cmp_xe, swap_xe);
+
+	buf = bucket_buf;
+	for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
+		memcpy(bhs[i]->b_data, buf, blocksize);
+		ocfs2_journal_dirty(handle, bhs[i]);
+	}
+
+commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+
+	if (bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(bhs[i]);
+	}
+	kfree(bhs);
+
+	kfree(bucket_buf);
+	return ret;
+}
+
+/*
+ * Move half nums of the xattr bucket in the previous cluster to this new
+ * cluster. We only touch the last cluster of the previous extend record.
+ *
+ * first_bh is the first buffer_head of a series of bucket in the same
+ * extent rec and header_bh is the header of one bucket in this cluster.
+ * They will be updated if we move the data header_bh contains to the new
+ * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
+ */
+static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
+					       handle_t *handle,
+					       struct buffer_head **first_bh,
+					       struct buffer_head **header_bh,
+					       u64 new_blkno,
+					       u64 prev_blkno,
+					       u32 num_clusters,
+					       u32 *first_hash)
+{
+	int i, ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+	int blocksize = inode->i_sb->s_blocksize;
+	struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
+	struct ocfs2_xattr_header *new_xh;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)((*first_bh)->b_data);
+
+	BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
+
+	prev_bh = *first_bh;
+	get_bh(prev_bh);
+	xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
+
+	prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+
+	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
+	     prev_blkno, new_blkno);
+
+	/*
+	 * We need to update the 1st half of the new cluster and
+	 * 1 more for the update of the 1st bucket of the previous
+	 * extent record.
+	 */
+	credits = bpc / 2 + 1;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, prev_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
+		old_bh = new_bh = NULL;
+		new_bh = sb_getblk(inode->i_sb, new_blkno);
+		if (!new_bh) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+		ret = ocfs2_journal_access(handle, inode, new_bh,
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			brelse(new_bh);
+			goto out;
+		}
+
+		ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
+		if (ret < 0) {
+			mlog_errno(ret);
+			brelse(new_bh);
+			goto out;
+		}
+
+		memcpy(new_bh->b_data, old_bh->b_data, blocksize);
+
+		if (i == 0) {
+			new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
+			new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
+
+			if (first_hash)
+				*first_hash = le32_to_cpu(
+					new_xh->xh_entries[0].xe_name_hash);
+			new_first_bh = new_bh;
+			get_bh(new_first_bh);
+		}
+
+		ocfs2_journal_dirty(handle, new_bh);
+
+		if (*header_bh == old_bh) {
+			brelse(*header_bh);
+			*header_bh = new_bh;
+			get_bh(*header_bh);
+
+			brelse(*first_bh);
+			*first_bh = new_first_bh;
+			get_bh(*first_bh);
+		}
+		brelse(new_bh);
+		brelse(old_bh);
+	}
+
+	le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
+
+	ocfs2_journal_dirty(handle, prev_bh);
+out:
+	brelse(prev_bh);
+	brelse(new_first_bh);
+	return ret;
+}
+
+static int ocfs2_read_xattr_bucket(struct inode *inode,
+				   u64 blkno,
+				   struct buffer_head **bhs,
+				   int new)
+{
+	int ret = 0;
+	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	if (!new)
+		return ocfs2_read_blocks(inode, blkno,
+					 blk_per_bucket, bhs,
+					 OCFS2_BH_CACHED);
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		bhs[i] = sb_getblk(inode->i_sb, blkno + i);
+		if (bhs[i] == NULL) {
+			ret = -EIO;
+			mlog_errno(ret);
+			break;
+		}
+		ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+	}
+
+	return ret;
+}
+
+/*
+ * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
+ * first_hash will record the 1st hash of the new bucket.
+ */
+static int ocfs2_half_xattr_bucket(struct inode *inode,
+				   handle_t *handle,
+				   u64 blk,
+				   u64 new_blk,
+				   u32 *first_hash,
+				   int new_bucket_head)
+{
+	int ret, i;
+	u16 count, start, len, name_value_len, xe_len, name_offset;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct buffer_head **s_bhs, **t_bhs = NULL;
+	struct ocfs2_xattr_header *xh;
+	struct ocfs2_xattr_entry *xe;
+	int blocksize = inode->i_sb->s_blocksize;
+
+	mlog(0, "move half of xattrs from bucket %llu to %llu\n",
+	     blk, new_blk);
+
+	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!s_bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, s_bhs[0],
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!t_bhs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   OCFS2_JOURNAL_ACCESS_CREATE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/* copy the whole bucket to the new first. */
+	for (i = 0; i < blk_per_bucket; i++)
+		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+
+	/* update the new bucket. */
+	xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+	count = le16_to_cpu(xh->xh_count);
+	start = count / 2;
+
+	/*
+	 * Calculate the total name/value len and xh_free_start for
+	 * the old bucket first.
+	 */
+	name_offset = OCFS2_XATTR_BUCKET_SIZE;
+	name_value_len = 0;
+	for (i = 0; i < start; i++) {
+		xe = &xh->xh_entries[i];
+		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		if (ocfs2_xattr_is_local(xe))
+			xe_len +=
+			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			xe_len += OCFS2_XATTR_ROOT_SIZE;
+		name_value_len += xe_len;
+		if (le16_to_cpu(xe->xe_name_offset) < name_offset)
+			name_offset = le16_to_cpu(xe->xe_name_offset);
+	}
+
+	/*
+	 * Now begin the modification to the new bucket.
+	 *
+	 * In the new bucket, We just move the xattr entry to the beginning
+	 * and don't touch the name/value. So there will be some holes in the
+	 * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
+	 * called.
+	 */
+	xe = &xh->xh_entries[start];
+	len = sizeof(struct ocfs2_xattr_entry) * (count - start);
+	mlog(0, "mv xattr entry len %d from %d to %d\n", len,
+	     (int)((char *)xe - (char *)xh),
+	     (int)((char *)xh->xh_entries - (char *)xh));
+	memmove((char *)xh->xh_entries, (char *)xe, len);
+	xe = &xh->xh_entries[count - start];
+	len = sizeof(struct ocfs2_xattr_entry) * start;
+	memset((char *)xe, 0, len);
+
+	le16_add_cpu(&xh->xh_count, -start);
+	le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
+
+	/* Calculate xh_free_start for the new bucket. */
+	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		if (ocfs2_xattr_is_local(xe))
+			xe_len +=
+			   OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			xe_len += OCFS2_XATTR_ROOT_SIZE;
+		if (le16_to_cpu(xe->xe_name_offset) <
+		    le16_to_cpu(xh->xh_free_start))
+			xh->xh_free_start = xe->xe_name_offset;
+	}
+
+	/* set xh->xh_num_buckets for the new xh. */
+	if (new_bucket_head)
+		xh->xh_num_buckets = cpu_to_le16(1);
+	else
+		xh->xh_num_buckets = 0;
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ocfs2_journal_dirty(handle, t_bhs[i]);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+	/* store the first_hash of the new bucket. */
+	if (first_hash)
+		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+
+	/*
+	 * Now only update the 1st block of the old bucket.
+	 * Please note that the entry has been sorted already above.
+	 */
+	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	memset(&xh->xh_entries[start], 0,
+	       sizeof(struct ocfs2_xattr_entry) * (count - start));
+	xh->xh_count = cpu_to_le16(start);
+	xh->xh_free_start = cpu_to_le16(name_offset);
+	xh->xh_name_value_len = cpu_to_le16(name_value_len);
+
+	ocfs2_journal_dirty(handle, s_bhs[0]);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	if (s_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(s_bhs[i]);
+	}
+	kfree(s_bhs);
+
+	if (t_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(t_bhs[i]);
+	}
+	kfree(t_bhs);
+
+	return ret;
+}
+
+/*
+ * Copy xattr from one bucket to another bucket.
+ *
+ * The caller must make sure that the journal transaction
+ * has enough space for journaling.
+ */
+static int ocfs2_cp_xattr_bucket(struct inode *inode,
+				 handle_t *handle,
+				 u64 s_blkno,
+				 u64 t_blkno,
+				 int t_is_new)
+{
+	int ret, i;
+	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int blocksize = inode->i_sb->s_blocksize;
+	struct buffer_head **s_bhs, **t_bhs = NULL;
+
+	BUG_ON(s_blkno == t_blkno);
+
+	mlog(0, "cp bucket %llu to %llu, target is %d\n",
+	     s_blkno, t_blkno, t_is_new);
+
+	s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+			GFP_NOFS);
+	if (!s_bhs)
+		return -ENOMEM;
+
+	ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+	if (ret)
+		goto out;
+
+	t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+			GFP_NOFS);
+	if (!t_bhs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret)
+			goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+		ocfs2_journal_dirty(handle, t_bhs[i]);
+	}
+
+out:
+	if (s_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(s_bhs[i]);
+	}
+	kfree(s_bhs);
+
+	if (t_bhs) {
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(t_bhs[i]);
+	}
+	kfree(t_bhs);
+
+	return ret;
+}
+
+/*
+ * Copy one xattr cluster from src_blk to to_blk.
+ * The to_blk will become the first bucket header of the cluster, so its
+ * xh_num_buckets will be initialized as the bucket num in the cluster.
+ */
+static int ocfs2_cp_xattr_cluster(struct inode *inode,
+				  handle_t *handle,
+				  struct buffer_head *first_bh,
+				  u64 src_blk,
+				  u64 to_blk,
+				  u32 *first_hash)
+{
+	int i, ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_header *xh;
+	u64 to_blk_start = to_blk;
+
+	mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
+
+	/*
+	 * We need to update the new cluster and 1 more for the update of
+	 * the 1st bucket of the previous extent rec.
+	 */
+	credits = bpc + 1;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < num_buckets; i++) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle,
+					    src_blk, to_blk, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	}
+
+	/* update the old bucket header. */
+	xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+	le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
+
+	ocfs2_journal_dirty(handle, first_bh);
+
+	/* update the new bucket header. */
+	ret = ocfs2_read_block(inode, to_blk_start, &bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	xh->xh_num_buckets = cpu_to_le16(num_buckets);
+
+	ocfs2_journal_dirty(handle, bh);
+
+	if (first_hash)
+		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Move half of the xattrs in this cluster to the new cluster.
+ * This function should only be called when bucket size == cluster size.
+ * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
+ */
+static int ocfs2_half_xattr_cluster(struct inode *inode,
+				    handle_t *handle,
+				    u64 prev_blk,
+				    u64 new_blk,
+				    u32 *first_hash)
+{
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int ret, credits = 2 * blk_per_bucket;
+
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
+
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/* Move half of the xattr in start_blk to the next bucket. */
+	return  ocfs2_half_xattr_bucket(inode, handle, prev_blk,
+					new_blk, first_hash, 1);
+}
+
+/*
+ * Move some xattrs from the old cluster to the new one since they are not
+ * contiguous in ocfs2 xattr tree.
+ *
+ * new_blk starts a new separate cluster, and we will move some xattrs from
+ * prev_blk to it. v_start will be set as the first name hash value in this
+ * new cluster so that it can be used as e_cpos during tree insertion and
+ * don't collide with our original b-tree operations. first_bh and header_bh
+ * will also be updated since they will be used in ocfs2_extend_xattr_bucket
+ * to extend the insert bucket.
+ *
+ * The problem is how much xattr should we move to the new one and when should
+ * we update first_bh and header_bh?
+ * 1. If cluster size > bucket size, that means the previous cluster has more
+ *    than 1 bucket, so just move half nums of bucket into the new cluster and
+ *    update the first_bh and header_bh if the insert bucket has been moved
+ *    to the new cluster.
+ * 2. If cluster_size == bucket_size:
+ *    a) If the previous extent rec has more than one cluster and the insert
+ *       place isn't in the last cluster, copy the entire last cluster to the
+ *       new one. This time, we don't need to upate the first_bh and header_bh
+ *       since they will not be moved into the new cluster.
+ *    b) Otherwise, move the bottom half of the xattrs in the last cluster into
+ *       the new one. And we set the extend flag to zero if the insert place is
+ *       moved into the new allocated cluster since no extend is needed.
+ */
+static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
+					    handle_t *handle,
+					    struct buffer_head **first_bh,
+					    struct buffer_head **header_bh,
+					    u64 new_blk,
+					    u64 prev_blk,
+					    u32 prev_clusters,
+					    u32 *v_start,
+					    int *extend)
+{
+	int ret = 0;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
+	     prev_blk, prev_clusters, new_blk);
+
+	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
+							  handle,
+							  first_bh,
+							  header_bh,
+							  new_blk,
+							  prev_blk,
+							  prev_clusters,
+							  v_start);
+	else {
+		u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
+
+		if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
+			ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
+						     last_blk, new_blk,
+						     v_start);
+		else {
+			ret = ocfs2_half_xattr_cluster(inode, handle,
+						       last_blk, new_blk,
+						       v_start);
+
+			if ((*header_bh)->b_blocknr == last_blk && extend)
+				*extend = 0;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Add a new cluster for xattr storage.
+ *
+ * If the new cluster is contiguous with the previous one, it will be
+ * appended to the same extent record, and num_clusters will be updated.
+ * If not, we will insert a new extent for it and move some xattrs in
+ * the last cluster into the new allocated one.
+ * We also need to limit the maximum size of a btree leaf, otherwise we'll
+ * lose the benefits of hashing because we'll have to search large leaves.
+ * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize,
+ * if it's bigger).
+ *
+ * first_bh is the first block of the previous extent rec and header_bh
+ * indicates the bucket we will insert the new xattrs. They will be updated
+ * when the header_bh is moved into the new cluster.
+ */
+static int ocfs2_add_new_xattr_cluster(struct inode *inode,
+				       struct buffer_head *root_bh,
+				       struct buffer_head **first_bh,
+				       struct buffer_head **header_bh,
+				       u32 *num_clusters,
+				       u32 prev_cpos,
+				       u64 prev_blkno,
+				       int *extend)
+{
+	int ret, credits;
+	u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 prev_clusters = *num_clusters;
+	u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
+	u64 block;
+	handle_t *handle = NULL;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_tree et;
+
+	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
+	     "previous xattr blkno = %llu\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     prev_cpos, prev_blkno);
+
+	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+
+	ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+				    &data_ac, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+					    clusters_to_add);
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+				     clusters_to_add, &bit_off, &num_bits);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto leave;
+	}
+
+	BUG_ON(num_bits > clusters_to_add);
+
+	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+	mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
+	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	if (prev_blkno + prev_clusters * bpc == block &&
+	    (prev_clusters + num_bits) << osb->s_clustersize_bits <=
+	     OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
+		/*
+		 * If this cluster is contiguous with the old one and
+		 * adding this new cluster, we don't surpass the limit of
+		 * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be
+		 * initialized and used like other buckets in the previous
+		 * cluster.
+		 * So add it as a contiguous one. The caller will handle
+		 * its init process.
+		 */
+		v_start = prev_cpos + prev_clusters;
+		*num_clusters = prev_clusters + num_bits;
+		mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
+		     num_bits);
+	} else {
+		ret = ocfs2_adjust_xattr_cross_cluster(inode,
+						       handle,
+						       first_bh,
+						       header_bh,
+						       block,
+						       prev_blkno,
+						       prev_clusters,
+						       &v_start,
+						       extend);
+		if (ret) {
+			mlog_errno(ret);
+			goto leave;
+		}
+	}
+
+	if (handle->h_buffer_credits < credits) {
+		/*
+		 * The journal has been restarted before, and don't
+		 * have enough space for the insertion, so extend it
+		 * here.
+		 */
+		ret = ocfs2_extend_trans(handle, credits);
+		if (ret) {
+			mlog_errno(ret);
+			goto leave;
+		}
+	}
+	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
+	     num_bits, block, v_start);
+	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
+				  num_bits, 0, meta_ac);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+leave:
+	if (handle)
+		ocfs2_commit_trans(osb, handle);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+/*
+ * Extend a new xattr bucket and move xattrs to the end one by one until
+ * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ */
+static int ocfs2_extend_xattr_bucket(struct inode *inode,
+				     struct buffer_head *first_bh,
+				     struct buffer_head *start_bh,
+				     u32 num_clusters)
+{
+	int ret, credits;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	u64 start_blk = start_bh->b_blocknr, end_blk;
+	u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
+	handle_t *handle;
+	struct ocfs2_xattr_header *first_xh =
+				(struct ocfs2_xattr_header *)first_bh->b_data;
+	u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
+
+	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
+	     "from %llu, len = %u\n", start_blk,
+	     (unsigned long long)first_bh->b_blocknr, num_clusters);
+
+	BUG_ON(bucket >= num_buckets);
+
+	end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+
+	/*
+	 * We will touch all the buckets after the start_bh(include it).
+	 * Add one more bucket and modify the first_bh.
+	 */
+	credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, first_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto commit;
+	}
+
+	while (end_blk != start_blk) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
+					    end_blk + blk_per_bucket, 0);
+		if (ret)
+			goto commit;
+		end_blk -= blk_per_bucket;
+	}
+
+	/* Move half of the xattr in start_blk to the next bucket. */
+	ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
+				      start_blk + blk_per_bucket, NULL, 0);
+
+	le16_add_cpu(&first_xh->xh_num_buckets, 1);
+	ocfs2_journal_dirty(handle, first_bh);
+
+commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * Add new xattr bucket in an extent record and adjust the buckets accordingly.
+ * xb_bh is the ocfs2_xattr_block.
+ * We will move all the buckets starting from header_bh to the next place. As
+ * for this one, half num of its xattrs will be moved to the next one.
+ *
+ * We will allocate a new cluster if current cluster is full and adjust
+ * header_bh and first_bh if the insert place is moved to the new cluster.
+ */
+static int ocfs2_add_new_xattr_bucket(struct inode *inode,
+				      struct buffer_head *xb_bh,
+				      struct buffer_head *header_bh)
+{
+	struct ocfs2_xattr_header *first_xh = NULL;
+	struct buffer_head *first_bh = NULL;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+	struct ocfs2_extent_list *el = &xb_root->xt_list;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int ret, num_buckets, extend = 1;
+	u64 p_blkno;
+	u32 e_cpos, num_clusters;
+
+	mlog(0, "Add new xattr bucket starting form %llu\n",
+	     (unsigned long long)header_bh->b_blocknr);
+
+	/*
+	 * Add refrence for header_bh here because it may be
+	 * changed in ocfs2_add_new_xattr_cluster and we need
+	 * to free it in the end.
+	 */
+	get_bh(header_bh);
+
+	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
+				  &num_clusters, el);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
+	first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+
+	if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+		ret = ocfs2_add_new_xattr_cluster(inode,
+						  xb_bh,
+						  &first_bh,
+						  &header_bh,
+						  &num_clusters,
+						  e_cpos,
+						  p_blkno,
+						  &extend);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (extend)
+		ret = ocfs2_extend_xattr_bucket(inode,
+						first_bh,
+						header_bh,
+						num_clusters);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(first_bh);
+	brelse(header_bh);
+	return ret;
+}
+
+static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					int offs)
+{
+	int block_off = offs >> inode->i_sb->s_blocksize_bits;
+
+	offs = offs % inode->i_sb->s_blocksize;
+	return bucket->bhs[block_off]->b_data + offs;
+}
+
+/*
+ * Handle the normal xattr set, including replace, delete and new.
+ *
+ * Note: "local" indicates the real data's locality. So we can't
+ * just its bucket locality by its length.
+ */
+static void ocfs2_xattr_set_entry_normal(struct inode *inode,
+					 struct ocfs2_xattr_info *xi,
+					 struct ocfs2_xattr_search *xs,
+					 u32 name_hash,
+					 int local)
+{
+	struct ocfs2_xattr_entry *last, *xe;
+	int name_len = strlen(xi->name);
+	struct ocfs2_xattr_header *xh = xs->header;
+	u16 count = le16_to_cpu(xh->xh_count), start;
+	size_t blocksize = inode->i_sb->s_blocksize;
+	char *val;
+	size_t offs, size, new_size;
+
+	last = &xh->xh_entries[count];
+	if (!xs->not_found) {
+		xe = xs->here;
+		offs = le16_to_cpu(xe->xe_name_offset);
+		if (ocfs2_xattr_is_local(xe))
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			size = OCFS2_XATTR_SIZE(name_len) +
+			OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+		/*
+		 * If the new value will be stored outside, xi->value has been
+		 * initalized as an empty ocfs2_xattr_value_root, and the same
+		 * goes with xi->value_len, so we can set new_size safely here.
+		 * See ocfs2_xattr_set_in_bucket.
+		 */
+		new_size = OCFS2_XATTR_SIZE(name_len) +
+			   OCFS2_XATTR_SIZE(xi->value_len);
+
+		le16_add_cpu(&xh->xh_name_value_len, -size);
+		if (xi->value) {
+			if (new_size > size)
+				goto set_new_name_value;
+
+			/* Now replace the old value with new one. */
+			if (local)
+				xe->xe_value_size = cpu_to_le64(xi->value_len);
+			else
+				xe->xe_value_size = 0;
+
+			val = ocfs2_xattr_bucket_get_val(inode,
+							 &xs->bucket, offs);
+			memset(val + OCFS2_XATTR_SIZE(name_len), 0,
+			       size - OCFS2_XATTR_SIZE(name_len));
+			if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
+				memcpy(val + OCFS2_XATTR_SIZE(name_len),
+				       xi->value, xi->value_len);
+
+			le16_add_cpu(&xh->xh_name_value_len, new_size);
+			ocfs2_xattr_set_local(xe, local);
+			return;
+		} else {
+			/*
+			 * Remove the old entry if there is more than one.
+			 * We don't remove the last entry so that we can
+			 * use it to indicate the hash value of the empty
+			 * bucket.
+			 */
+			last -= 1;
+			le16_add_cpu(&xh->xh_count, -1);
+			if (xh->xh_count) {
+				memmove(xe, xe + 1,
+					(void *)last - (void *)xe);
+				memset(last, 0,
+				       sizeof(struct ocfs2_xattr_entry));
+			} else
+				xh->xh_free_start =
+					cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+
+			return;
+		}
+	} else {
+		/* find a new entry for insert. */
+		int low = 0, high = count - 1, tmp;
+		struct ocfs2_xattr_entry *tmp_xe;
+
+		while (low <= high && count) {
+			tmp = (low + high) / 2;
+			tmp_xe = &xh->xh_entries[tmp];
+
+			if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+				low = tmp + 1;
+			else if (name_hash <
+				 le32_to_cpu(tmp_xe->xe_name_hash))
+				high = tmp - 1;
+			else {
+				low = tmp;
+				break;
+			}
+		}
+
+		xe = &xh->xh_entries[low];
+		if (low != count)
+			memmove(xe + 1, xe, (void *)last - (void *)xe);
+
+		le16_add_cpu(&xh->xh_count, 1);
+		memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
+		xe->xe_name_hash = cpu_to_le32(name_hash);
+		xe->xe_name_len = name_len;
+		ocfs2_xattr_set_type(xe, xi->name_index);
+	}
+
+set_new_name_value:
+	/* Insert the new name+value. */
+	size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
+
+	/*
+	 * We must make sure that the name/value pair
+	 * exists in the same block.
+	 */
+	offs = le16_to_cpu(xh->xh_free_start);
+	start = offs - size;
+
+	if (start >> inode->i_sb->s_blocksize_bits !=
+	    (offs - 1) >> inode->i_sb->s_blocksize_bits) {
+		offs = offs - offs % blocksize;
+		xh->xh_free_start = cpu_to_le16(offs);
+	}
+
+	val = ocfs2_xattr_bucket_get_val(inode,
+					 &xs->bucket, offs - size);
+	xe->xe_name_offset = cpu_to_le16(offs - size);
+
+	memset(val, 0, size);
+	memcpy(val, xi->name, name_len);
+	memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
+
+	xe->xe_value_size = cpu_to_le64(xi->value_len);
+	ocfs2_xattr_set_local(xe, local);
+	xs->here = xe;
+	le16_add_cpu(&xh->xh_free_start, -size);
+	le16_add_cpu(&xh->xh_name_value_len, size);
+
+	return;
+}
+
+static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
+					     handle_t *handle,
+					     struct ocfs2_xattr_search *xs,
+					     struct buffer_head **bhs,
+					     u16 bh_num)
+{
+	int ret = 0, off, block_off;
+	struct ocfs2_xattr_entry *xe = xs->here;
+
+	/*
+	 * First calculate all the blocks we should journal_access
+	 * and journal_dirty. The first block should always be touched.
+	 */
+	ret = ocfs2_journal_dirty(handle, bhs[0]);
+	if (ret)
+		mlog_errno(ret);
+
+	/* calc the data. */
+	off = le16_to_cpu(xe->xe_name_offset);
+	block_off = off >> inode->i_sb->s_blocksize_bits;
+	ret = ocfs2_journal_dirty(handle, bhs[block_off]);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+/*
+ * Set the xattr entry in the specified bucket.
+ * The bucket is indicated by xs->bucket and it should have the enough
+ * space for the xattr insertion.
+ */
+static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+					   struct ocfs2_xattr_info *xi,
+					   struct ocfs2_xattr_search *xs,
+					   u32 name_hash,
+					   int local)
+{
+	int i, ret;
+	handle_t *handle = NULL;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
+	     (unsigned long)xi->value_len, xi->name_index,
+	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+
+	if (!xs->bucket.bhs[1]) {
+		ret = ocfs2_read_blocks(inode,
+					xs->bucket.bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					OCFS2_BH_CACHED);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, blk_per_bucket);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		handle = NULL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for (i = 0; i < blk_per_bucket; i++) {
+		ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+
+	/*Only dirty the blocks we have touched in set xattr. */
+	ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
+						xs->bucket.bhs, blk_per_bucket);
+	if (ret)
+		mlog_errno(ret);
+out:
+	ocfs2_commit_trans(osb, handle);
+
+	return ret;
+}
+
+static int ocfs2_xattr_value_update_size(struct inode *inode,
+					 struct buffer_head *xe_bh,
+					 struct ocfs2_xattr_entry *xe,
+					 u64 new_size)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+
+	handle = ocfs2_start_trans(osb, 1);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xe_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	xe->xe_value_size = cpu_to_le64(new_size);
+
+	ret = ocfs2_journal_dirty(handle, xe_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+/*
+ * Truncate the specified xe_off entry in xattr bucket.
+ * bucket is indicated by header_bh and len is the new length.
+ * Both the ocfs2_xattr_value_root and the entry will be updated here.
+ *
+ * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
+ */
+static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
+					     struct buffer_head *header_bh,
+					     int xe_off,
+					     int len)
+{
+	int ret, offset;
+	u64 value_blk;
+	struct buffer_head *value_bh = NULL;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_xattr_header *xh =
+			(struct ocfs2_xattr_header *)header_bh->b_data;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	xe = &xh->xh_entries[xe_off];
+
+	BUG_ON(!xe || ocfs2_xattr_is_local(xe));
+
+	offset = le16_to_cpu(xe->xe_name_offset) +
+		 OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+	value_blk = offset / blocksize;
+
+	/* We don't allow ocfs2_xattr_value to be stored in different block. */
+	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
+	value_blk += header_bh->b_blocknr;
+
+	ret = ocfs2_read_block(inode, value_blk, &value_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	xv = (struct ocfs2_xattr_value_root *)
+		(value_bh->b_data + offset % blocksize);
+
+	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+	     xe_off, (unsigned long long)header_bh->b_blocknr, len);
+	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	brelse(value_bh);
+	return ret;
+}
+
+static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
+						struct ocfs2_xattr_search *xs,
+						int len)
+{
+	int ret, offset;
+	struct ocfs2_xattr_entry *xe = xs->here;
+	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
+
+	BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+
+	offset = xe - xh->xh_entries;
+	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
+						offset, len);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
+static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+						struct ocfs2_xattr_search *xs,
+						char *val,
+						int value_len)
+{
+	int offset;
+	struct ocfs2_xattr_value_root *xv;
+	struct ocfs2_xattr_entry *xe = xs->here;
+
+	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
+
+	offset = le16_to_cpu(xe->xe_name_offset) +
+		 OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+	xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
+
+	return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+}
+
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+				  struct buffer_head *root_bh,
+				  u64 blkno,
+				  u32 cpos,
+				  u32 len)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)root_bh->b_data;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+	struct ocfs2_extent_tree et;
+
+	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
+	     cpos, len, (unsigned long long)blkno);
+
+	ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
+
+	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+				  &dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+					 struct ocfs2_xattr_search *xs)
+{
+	handle_t *handle = NULL;
+	struct ocfs2_xattr_header *xh = xs->bucket.xh;
+	struct ocfs2_xattr_entry *last = &xh->xh_entries[
+						le16_to_cpu(xh->xh_count) - 1];
+	int ret = 0;
+
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		return;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* Remove the old entry. */
+	memmove(xs->here, xs->here + 1,
+		(void *)last - (void *)xs->here);
+	memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+	le16_add_cpu(&xh->xh_count, -1);
+
+	ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
+	if (ret < 0)
+		mlog_errno(ret);
+out_commit:
+	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+}
+
+/*
+ * Set the xattr name/value in the bucket specified in xs.
+ *
+ * As the new value in xi may be stored in the bucket or in an outside cluster,
+ * we divide the whole process into 3 steps:
+ * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
+ * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
+ * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
+ * 4. If the clusters for the new outside value can't be allocated, we need
+ *    to free the xattr we allocated in set.
+ */
+static int ocfs2_xattr_set_in_bucket(struct inode *inode,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xs)
+{
+	int ret, local = 1;
+	size_t value_len;
+	char *val = (char *)xi->value;
+	struct ocfs2_xattr_entry *xe = xs->here;
+	u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
+					      strlen(xi->name));
+
+	if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
+		/*
+		 * We need to truncate the xattr storage first.
+		 *
+		 * If both the old and new value are stored to
+		 * outside block, we only need to truncate
+		 * the storage and then set the value outside.
+		 *
+		 * If the new value should be stored within block,
+		 * we should free all the outside block first and
+		 * the modification to the xattr block will be done
+		 * by following steps.
+		 */
+		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+			value_len = xi->value_len;
+		else
+			value_len = 0;
+
+		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+							   value_len);
+		if (ret)
+			goto out;
+
+		if (value_len)
+			goto set_value_outside;
+	}
+
+	value_len = xi->value_len;
+	/* So we have to handle the inside block change now. */
+	if (value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/*
+		 * If the new value will be stored outside of block,
+		 * initalize a new empty value root and insert it first.
+		 */
+		local = 0;
+		xi->value = &def_xv;
+		xi->value_len = OCFS2_XATTR_ROOT_SIZE;
+	}
+
+	ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+		goto out;
+
+	/* allocate the space now for the outside block storage. */
+	ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+						   value_len);
+	if (ret) {
+		mlog_errno(ret);
+
+		if (xs->not_found) {
+			/*
+			 * We can't allocate enough clusters for outside
+			 * storage and we have allocated xattr already,
+			 * so need to remove it.
+			 */
+			ocfs2_xattr_bucket_remove_xs(inode, xs);
+		}
+		goto out;
+	}
+
+set_value_outside:
+	ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+out:
+	return ret;
+}
+
+/* check whether the xattr bucket is filled up with the same hash value. */
+static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
+					      struct ocfs2_xattr_bucket *bucket)
+{
+	struct ocfs2_xattr_header *xh = bucket->xh;
+
+	if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
+	    xh->xh_entries[0].xe_name_hash) {
+		mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
+		     "hash = %u\n",
+		     (unsigned long long)bucket->bhs[0]->b_blocknr,
+		     le32_to_cpu(xh->xh_entries[0].xe_name_hash));
+		return -ENOSPC;
+	}
+
+	return 0;
+}
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+					     struct ocfs2_xattr_info *xi,
+					     struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_header *xh;
+	struct ocfs2_xattr_entry *xe;
+	u16 count, header_size, xh_free_start;
+	int i, free, max_free, need, old;
+	size_t value_size = 0, name_len = strlen(xi->name);
+	size_t blocksize = inode->i_sb->s_blocksize;
+	int ret, allocation = 0;
+	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	mlog_entry("Set xattr %s in xattr index block\n", xi->name);
+
+try_again:
+	xh = xs->header;
+	count = le16_to_cpu(xh->xh_count);
+	xh_free_start = le16_to_cpu(xh->xh_free_start);
+	header_size = sizeof(struct ocfs2_xattr_header) +
+			count * sizeof(struct ocfs2_xattr_entry);
+	max_free = OCFS2_XATTR_BUCKET_SIZE -
+		le16_to_cpu(xh->xh_name_value_len) - header_size;
+
+	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
+			"of %u which exceed block size\n",
+			(unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+			header_size);
+
+	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		value_size = OCFS2_XATTR_ROOT_SIZE;
+	else if (xi->value)
+		value_size = OCFS2_XATTR_SIZE(xi->value_len);
+
+	if (xs->not_found)
+		need = sizeof(struct ocfs2_xattr_entry) +
+			OCFS2_XATTR_SIZE(name_len) + value_size;
+	else {
+		need = value_size + OCFS2_XATTR_SIZE(name_len);
+
+		/*
+		 * We only replace the old value if the new length is smaller
+		 * than the old one. Otherwise we will allocate new space in the
+		 * bucket to store it.
+		 */
+		xe = xs->here;
+		if (ocfs2_xattr_is_local(xe))
+			old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+		else
+			old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+		if (old >= value_size)
+			need = 0;
+	}
+
+	free = xh_free_start - header_size;
+	/*
+	 * We need to make sure the new name/value pair
+	 * can exist in the same block.
+	 */
+	if (xh_free_start % blocksize < need)
+		free -= xh_free_start % blocksize;
+
+	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
+	     "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
+	     " %u\n", xs->not_found,
+	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
+	     le16_to_cpu(xh->xh_name_value_len));
+
+	if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+		if (need <= max_free &&
+		    count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+			/*
+			 * We can create the space by defragment. Since only the
+			 * name/value will be moved, the xe shouldn't be changed
+			 * in xs.
+			 */
+			ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			xh_free_start = le16_to_cpu(xh->xh_free_start);
+			free = xh_free_start - header_size;
+			if (xh_free_start % blocksize < need)
+				free -= xh_free_start % blocksize;
+
+			if (free >= need)
+				goto xattr_set;
+
+			mlog(0, "Can't get enough space for xattr insert by "
+			     "defragment. Need %u bytes, but we have %d, so "
+			     "allocate new bucket for it.\n", need, free);
+		}
+
+		/*
+		 * We have to add new buckets or clusters and one
+		 * allocation should leave us enough space for insert.
+		 */
+		BUG_ON(allocation);
+
+		/*
+		 * We do not allow for overlapping ranges between buckets. And
+		 * the maximum number of collisions we will allow for then is
+		 * one bucket's worth, so check it here whether we need to
+		 * add a new bucket for the insert.
+		 */
+		ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_add_new_xattr_bucket(inode,
+						 xs->xattr_bh,
+						 xs->bucket.bhs[0]);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		for (i = 0; i < blk_per_bucket; i++)
+			brelse(xs->bucket.bhs[i]);
+
+		memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+						   xi->name_index,
+						   xi->name, xs);
+		if (ret && ret != -ENODATA)
+			goto out;
+		xs->not_found = ret;
+		allocation = 1;
+		goto try_again;
+	}
+
+xattr_set:
+	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+out:
+	mlog_exit(ret);
+	return ret;
+}
+
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+					struct ocfs2_xattr_bucket *bucket,
+					void *para)
+{
+	int ret = 0;
+	struct ocfs2_xattr_header *xh = bucket->xh;
+	u16 i;
+	struct ocfs2_xattr_entry *xe;
+
+	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+		xe = &xh->xh_entries[i];
+		if (ocfs2_xattr_is_local(xe))
+			continue;
+
+		ret = ocfs2_xattr_bucket_value_truncate(inode,
+							bucket->bhs[0],
+							i, 0);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+					  struct buffer_head *xb_bh)
+{
+	struct ocfs2_xattr_block *xb =
+			(struct ocfs2_xattr_block *)xb_bh->b_data;
+	struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+	int ret = 0;
+	u32 name_hash = UINT_MAX, e_cpos, num_clusters;
+	u64 p_blkno;
+
+	if (le16_to_cpu(el->l_next_free_rec) == 0)
+		return 0;
+
+	while (name_hash > 0) {
+		ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+					  &e_cpos, &num_clusters, el);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+						  ocfs2_delete_xattr_in_bucket,
+						  NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
+					     p_blkno, e_cpos, num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+
+		if (e_cpos == 0)
+			break;
+
+		name_hash = e_cpos - 1;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * 'trusted' attributes support
+ */
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+
+static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
+				       size_t list_size, const char *name,
+				       size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
+				   void *buffer, size_t size)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
+				   const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= ocfs2_xattr_trusted_list,
+	.get	= ocfs2_xattr_trusted_get,
+	.set	= ocfs2_xattr_trusted_set,
+};
+
+
+/*
+ * 'user' attributes support
+ */
+
+#define XATTR_USER_PREFIX "user."
+
+static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
+				    size_t list_size, const char *name,
+				    size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+	const size_t total_len = prefix_len + name_len + 1;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_USER_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
+				void *buffer, size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
+				const void *value, size_t size, int flags)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+		return -EOPNOTSUPP;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= ocfs2_xattr_user_list,
+	.get	= ocfs2_xattr_user_get,
+	.set	= ocfs2_xattr_user_set,
+};
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 00000000000..c25c7c62a05
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,68 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_XATTR_H
+#define OCFS2_XATTR_H
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+enum ocfs2_xattr_type {
+	OCFS2_XATTR_INDEX_USER = 1,
+	OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
+	OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+	OCFS2_XATTR_INDEX_TRUSTED,
+	OCFS2_XATTR_INDEX_SECURITY,
+	OCFS2_XATTR_MAX
+};
+
+extern struct xattr_handler ocfs2_xattr_user_handler;
+extern struct xattr_handler ocfs2_xattr_trusted_handler;
+
+extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+			   size_t, int);
+extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
+extern struct xattr_handler *ocfs2_xattr_handlers[];
+
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+	return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+	return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+
+static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
+{
+	u16 len = sb->s_blocksize -
+		 offsetof(struct ocfs2_xattr_header, xh_entries);
+
+	return len / sizeof(struct ocfs2_xattr_entry);
+}
+#endif /* OCFS2_XATTR_H */
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 66c1ab87656..b675a49c182 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -683,6 +683,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
+#ifdef CONFIG_FILE_LOCKING
 static int locks_open(struct inode *inode, struct file *filp)
 {
 	return seq_open(filp, &locks_seq_operations);
@@ -694,6 +695,7 @@ static const struct file_operations proc_locks_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+#endif /* CONFIG_FILE_LOCKING */
 
 static int execdomains_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
@@ -887,7 +889,9 @@ void __init proc_misc_init(void)
 #ifdef CONFIG_PRINTK
 	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
 #endif
+#ifdef CONFIG_FILE_LOCKING
 	proc_create("locks", 0, NULL, &proc_locks_operations);
+#endif
 	proc_create("devices", 0, NULL, &proc_devinfo_operations);
 	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
 #ifdef CONFIG_BLOCK